Julien Moutinho / haskell-gargantext · Commits

Commit c651ce24
Authored Oct 09, 2018 by Alexandre Delanoë

Merge remote-tracking branch 'origin/lang-parser'

Parents: 9f5a6d1c, f7791341
Showing 2 changed files with 117 additions and 0 deletions:

    package.yaml                                  +4    -0
    src/Gargantext/Text/Parsers/Wikimedia.hs      +113  -0
package.yaml

@@ -52,6 +52,7 @@ library:
     - Gargantext.Text.Metrics.Count
     - Gargantext.Text.Parsers.CSV
     - Gargantext.Text.Parsers.Date
+    - Gargantext.Text.Parsers.Wikimedia
     - Gargantext.Text.Parsers.WOS
     - Gargantext.Text.Search
     - Gargantext.Text.Terms
@@ -111,6 +112,7 @@ library:
     - mtl
     - natural-transformation
     - opaleye
+    - pandoc
     - parsec
     - path
     - path-io
@@ -155,6 +157,8 @@ library:
     - wai-cors
     - wai-extra
     - warp
+    - xml-conduit
+    - xml-types
     - yaml
     - zip
     - zlib
src/Gargantext/Text/Parsers/Wikimedia.hs (new file, 0 → 100644)
{-|
Module      : Gargantext.Text.Parsers.Wikimedia
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

@Gargantext.Text.Parsers.Wikimedia@:
This module provides a parser for Wikipedia dumps: an XML parser for
Wikipedia's XML format and a Wikimedia-to-plaintext converter for the
Wikipedia text field.
-}

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}

module Gargantext.Text.Parsers.Wikimedia where

import Gargantext.Prelude
import Text.XML.Stream.Parse
import Control.Monad.Catch
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either

-- | Use case
-- >>> :{
--  wikimediaFile <- BL.readFile "text.xml"
--  _ <- runConduit $ parseLBS def wikimediaFile
--        .| force "mediawiki required" parseMediawiki
--        .| CL.mapM mediawikiPageToPlain
--        .| CL.mapM_ print
-- :}

-- | A simple "Page" type.
-- For the moment it carries only the text and the title
-- (since there is no abstract); we will see whether other data are relevant.
data Page = Page
  { _markupFormat :: MarkupFormat
  , _title        :: Maybe T.Text
  , _text         :: Maybe T.Text
  }
  deriving (Show)

data MarkupFormat = Mediawiki | Plaintext
  deriving (Show)

-- | Parse a @<revision>@ element and return its text field.
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision =
  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
    text <- force "text is missing" $ ignoreExcept
      "{http://www.mediawiki.org/xml/export-0.10/}text" content
    many_ ignoreAnyTreeContent
    return text

-- | Utility function that matches everything but the given tag.
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)

-- | Utility function that parses nothing but the given tag,
-- useful because we have to consume all the data.
ignoreExcept :: MonadThrow m => Name
             -> ConduitT Event o m b
             -> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
  _ <- consumeExcept name
  tagIgnoreAttrs (matching (== name)) f

-- | Utility function that consumes everything but the given tag,
-- useful because we have to consume all the data.
consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept = many_ . ignoreTreeContent . tagUntil

-- | Parse a @<page>@ element into a Mediawiki 'Page' (title and raw text).
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage =
  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
    title <- tagNoAttr
      "{http://www.mediawiki.org/xml/export-0.10/}title" content
    _ <- consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
    revision <- parseRevision
    many_ ignoreAnyTreeContent
    return $ Page Mediawiki title revision

-- | Parse the top-level @<mediawiki>@ element and yield every page it contains.
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki =
  tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
    $ manyYield' parsePage

-- | Convert a Mediawiki Page to a Plaintext Page.
-- We need to wrap the result in IO to parse and to combine it.
mediawikiPageToPlain :: Page -> IO Page
mediawikiPageToPlain page = do
  title    <- mediaToPlain $ _title page
  revision <- mediaToPlain $ _text  page
  return $ Page Plaintext title revision
  where
    mediaToPlain media =
      case media of
        Nothing  -> return Nothing
        Just med -> do
          res <- runIO $ do
            doc <- readMediaWiki def med
            writePlain def doc
          case res of
            Left _  -> return Nothing
            Right r -> return $ Just r
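
As a rough usage sketch (not part of this commit), the doctest in the module header can be expanded into a small standalone program. The module name Main, the dump path "text.xml", and the Data.Default.Class import providing def for the xml-conduit parse settings are assumptions; the pipeline itself mirrors the use case shown above.

{-# LANGUAGE OverloadedStrings #-}
module Main where

import qualified Data.ByteString.Lazy as BL
import qualified Data.Conduit.List as CL
import Data.Conduit (runConduit, (.|))
import Data.Default.Class (def)          -- assumed source of 'def' for ParseSettings
import Text.XML.Stream.Parse (parseLBS, force)
import Gargantext.Text.Parsers.Wikimedia

main :: IO ()
main = do
  -- Read the dump lazily, stream its XML events through the Wikimedia
  -- parser, convert every page to plain text with Pandoc, and print it.
  wikimediaFile <- BL.readFile "text.xml"   -- hypothetical dump file
  runConduit $ parseLBS def wikimediaFile
    .| force "mediawiki required" parseMediawiki
    .| CL.mapM mediawikiPageToPlain
    .| CL.mapM_ print

Since parseMediawiki yields pages one at a time, each page is converted and printed as it is parsed rather than after the whole dump has been collected.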