Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
P
purescript-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
153
Issues
153
List
Board
Labels
Milestones
Merge Requests
3
Merge Requests
3
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
purescript-gargantext
Commits
03ffdda9
Commit
03ffdda9
authored
Nov 12, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TEXT][PARSER][XML] Issue.
parent
5c8e2fc5
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
142 additions
and
30 deletions
+142
-30
PubMed.hs
src/Gargantext/Text/Parsers/PubMed.hs
+102
-0
Wikimedia.hs
src/Gargantext/Text/Parsers/Wikimedia.hs
+40
-30
No files found.
src/Gargantext/Text/Parsers/PubMed.hs
0 → 100644
View file @
03ffdda9
{-|
Module : Gargantext.Text.Parsers.PubMed
Description : Parser for PubMed XML exports
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext.Text.Parsers.PubMed@:
This module provides a streaming XML parser for PubMed article sets.
It extracts the article title and the journal title of each article,
reusing the tag-skipping helpers of "Gargantext.Text.Parsers.Wikimedia".
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Text.Parsers.PubMed
where
{-
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either
-}
import
Control.Monad
(
join
)
import
GHC.IO
(
FilePath
)
import
Prelude
(
read
)
import
Gargantext.Prelude
import
Control.Applicative
((
<*
))
import
Control.Monad.Catch
(
MonadThrow
)
import
Data.Maybe
import
Data.Monoid
(
mconcat
)
import
Text.XML.Stream.Parse
import
Data.Conduit
(
runConduit
,
(
.|
),
ConduitT
)
import
Data.Text
(
Text
,
unpack
)
import
Data.XML.Types
(
Event
)
import
Data.ByteString
(
ByteString
)
import
qualified
Data.ByteString.Lazy
as
DBL
import
Gargantext.Text.Parsers.Wikimedia
-- | Result of running 'pubMedParser' over the embedded 'pubMedData'
-- sample document.
issueXml :: Maybe [PubMedArticle]
issueXml = pubMedParser pubMedData
-- | Fields extracted from one PubMed article by 'parseMedlineCitation'.
data PubMedArticle = PubMedArticle
  { pubmed_title   :: Maybe Text -- ^ article title, when one was parsed
  , pubmed_journal :: Maybe Text -- ^ journal title, when one was parsed
  }
  deriving (Show)
-- | Reads the file at the given path as a lazy 'DBL.ByteString' and hands
-- its contents to 'pubMedParser'.
readPubMedFile :: FilePath -> IO (Maybe [PubMedArticle])
readPubMedFile path = do
  contents <- DBL.readFile path
  return (pubMedParser contents)
-- | Parses a lazy 'DBL.ByteString' with 'parseLBS' and feeds the resulting
-- event stream to 'parseArticles'. The whole pipeline runs in 'Maybe';
-- 'force' is applied so that a missing top-level element is a failure
-- rather than a successful empty result.
pubMedParser :: DBL.ByteString -> Maybe [PubMedArticle]
pubMedParser bstring =
  runConduit (parseLBS def bstring .| force "Pubmed" parseArticles)
-- | Matches the @PubmedArticleSet@ element (attributes ignored) and
-- collects every article recognised by 'parseArticle' inside it.
parseArticles :: MonadThrow m => ConduitT Event o m (Maybe [PubMedArticle])
parseArticles = tagIgnoreAttrs "PubmedArticleSet" (many parseArticle)
-- | Matches one @PubmedArticle@ element (attributes ignored) and parses
-- its @MedlineCitation@ child with 'parseMedlineCitation'.
--
-- Any sibling that follows @MedlineCitation@ (the PubMed DTD allows a
-- @PubmedData@ element there) is consumed and discarded. The tag parsers
-- of "Text.XML.Stream.Parse" throw when the inner parser leaves content
-- unconsumed before the closing tag, so without this drain every article
-- carrying extra children would make the whole parse fail.
parseArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
parseArticle =
  tagIgnoreAttrs "PubmedArticle" $
    parseMedlineCitation <* many_ ignoreAnyTreeContent
-- | Parses the @MedlineCitation@ element into a 'PubMedArticle'.
--
-- The element must be the next one in the stream, otherwise 'force'
-- throws with the message @"medlineCitation"@.
--
-- Inside it, siblings are skipped up to @Article@, which is then entered
-- (the previous version stopped in front of @Article@ without entering
-- it, so neither @Journal@ nor @ArticleTitle@ could ever be matched and
-- both fields always came back as 'Nothing'). Within @Article@:
--
-- * the journal title is the text of @Journal/Title@;
-- * the article title is the text of @ArticleTitle@.
--
-- Remaining children at every level are drained with
-- @many_ ignoreAnyTreeContent@ so the enclosing tag parsers see their
-- closing tags. A missing @Article@, @Journal@, @Title@ or @ArticleTitle@
-- yields 'Nothing' in the corresponding field instead of an exception.
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation =
  force "medlineCitation" $ tagIgnoreAttrs "MedlineCitation" $ do
    article <- manyTagsUntil "Article" $ do
      -- Outer Maybe: was a Journal element found; inner Maybe: did it
      -- contain a Title element.
      journal <- manyTagsUntil "Journal" $
                   manyTagsUntil "Title" content <* many_ ignoreAnyTreeContent
      title   <- manyTagsUntil "ArticleTitle" content
      -- Drain whatever follows ArticleTitle inside Article.
      many_ ignoreAnyTreeContent
      return $ PubMedArticle title (join journal)
    -- Drain whatever follows Article inside MedlineCitation.
    many_ ignoreAnyTreeContent
    return $ fromMaybe (PubMedArticle Nothing Nothing) article
-- | Embedded sample PubMed XML document, built by concatenating the
-- string fragments below with 'mconcat'. It holds a single
-- @PubmedArticle@ and is the input used by 'issueXml'.
pubMedData
::
DBL
.
ByteString
pubMedData
=
mconcat
[
"<?xml version=
\"
1.0
\"
?>"
,
"<!DOCTYPE PubmedArticleSet PUBLIC
\"
-//NLM//DTD PubMedArticle, 1st June 2018//EN
\"
\"
https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd
\"
>"
,
"<PubmedArticleSet>"
,
"<PubmedArticle>"
,
"<MedlineCitation Status=
\"
Publisher
\"
Owner=
\"
NLM
\"
>"
,
" <PMID Version=
\"
1
\"
>30357468</PMID>"
,
" <DateRevised>"
,
" <Year>2018</Year>"
,
" </DateRevised>"
,
" <Article PubModel=
\"
Print-Electronic
\"
>"
,
" <Journal>"
,
" <ISSN IssnType=
\"
Electronic
\"
>1432-1076</ISSN>"
,
" <Title>European journal of pediatrics</Title>"
,
" </Journal>"
,
" <ArticleTitle>European journal of pediatrics</ArticleTitle>"
,
" </Article>"
,
"</MedlineCitation>"
,
"</PubmedArticle>"
,
"</PubmedArticleSet>"
]
src/Gargantext/Text/Parsers/Wikimedia.hs
View file @
03ffdda9
...
...
@@ -16,15 +16,17 @@ and an wikimedia to plaintext converter for the wikipedia text field
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Text.Parsers.Wikimedia
where
import
Gargantext.Prelud
e
import
Text.XML.Stream.Parse
module
Gargantext.Text.Parsers.Wikimedia
wher
e
import
Control.Monad.Catch
import
Data.Conduit
import
Data.Either
import
Data.Text
as
T
import
Data.XML.Types
(
Event
,
Name
)
import
Gargantext.Prelude
import
Text.Pandoc
import
Data.Text
as
T
import
Data.Either
import
Text.XML.Stream.Parse
-- | Use case
-- :{
...
...
@@ -38,9 +40,8 @@ import Data.Either
-- | A simple "Page" type.
-- For the moment it takes only text and title
-- (since there is no abstract) will see if other data are relevant.
data
Page
=
Page
{
_markupFormat
::
MarkupFormat
data
Page
=
Page
{
_markupFormat
::
MarkupFormat
,
_title
::
Maybe
T
.
Text
,
_text
::
Maybe
T
.
Text
}
...
...
@@ -50,40 +51,49 @@ data MarkupFormat = Mediawiki | Plaintext
deriving
(
Show
)
parseRevision
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
T
.
Text
)
parseRevision
=
tagNoAttr
"{http://www.mediawiki.org/xml/export-0.10/}revision"
$
do
text
<-
force
"text is missing"
$
ignoreExcept
"{http://www.mediawiki.org/xml/export-0.10/}text"
content
many_
$
ignoreAnyTreeContent
parseRevision
=
tagNoAttr
"{http://www.mediawiki.org/xml/export-0.10/}revision"
$
do
text
<-
force
"text is missing"
$
ignoreExcept
"{http://www.mediawiki.org/xml/export-0.10/}text"
content
many_
ignoreAnyTreeContent
return
text
-- | Builds a 'NameMatcher' that accepts every element name except the
-- given one.
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (\candidate -> candidate /= name)
-- | Repeatedly applies 'ignoreTreeContent' to elements whose name differs
-- from the given one, discarding them; useful because all data preceding
-- the wanted tag has to be consumed.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ name = many_ (ignoreTreeContent (tagUntil name))
-- | Variant of 'manyTagsUntil_' that discards the non-matching elements
-- with 'ignoreTag' instead of 'ignoreTreeContent'.
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' name = many_ (ignoreTag (tagUntil name))
-- | Utility function that parses nothing but the given tag;
-- useful because all the data has to be consumed.
ignoreExcept
::
MonadThrow
m
=>
Name
->
ConduitT
Event
o
m
b
->
ConduitT
Event
o
m
(
Maybe
b
)
ignoreExcept
name
f
=
do
_
<-
consumeExcept
name
tagIgnoreAttrs
(
matching
(
==
name
))
f
_
<-
manyTagsUntil_
name
tagIgnoreAttrs
(
matching
(
==
name
))
f
-- TODO: remove ignoreExcept to:
-- many ignoreAnyTreeContentUntil "Article"
-- | Skips elements with 'manyTagsUntil_' until one named @name@ comes up,
-- then runs @f@ inside that element (attributes ignored). The result is
-- 'Nothing' when no element named @name@ follows the skipped ones.
manyTagsUntil :: MonadThrow m
              => Name
              -> ConduitT Event o m b
              -> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do
  manyTagsUntil_ name
  tagIgnoreAttrs onlyName f
  where
    onlyName = matching (== name)
-- | Repeatedly applies 'ignoreTreeContent' to elements whose name differs
-- from the given one, discarding them; useful because all the data has to
-- be consumed.
consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept name = many_ (ignoreTreeContent (tagUntil name))
parsePage
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
Page
)
parsePage
=
tagNoAttr
"{http://www.mediawiki.org/xml/export-0.10/}page"
$
do
title
<-
tagNoAttr
"{http://www.mediawiki.org/xml/export-0.10/}title"
content
_
<-
consumeExcept
"{http://www.mediawiki.org/xml/export-0.10/}revision"
_
<-
manyTagsUntil_
"{http://www.mediawiki.org/xml/export-0.10/}revision"
revision
<-
parseRevision
many_
$
ignoreAnyTreeContent
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment