Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
199
Issues
199
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
681674f6
Commit
681674f6
authored
Nov 12, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TEXT][PARSER][PUBMED] PubDate or ArticleDate are not reliable.
parent
03ffdda9
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
118 additions
and
55 deletions
+118
-55
Parsers.hs
src/Gargantext/Text/Parsers.hs
+1
-1
PubMed.hs
src/Gargantext/Text/Parsers/PubMed.hs
+116
-53
Wikimedia.hs
src/Gargantext/Text/Parsers/Wikimedia.hs
+1
-1
No files found.
src/Gargantext/Text/Parsers.hs
View file @
681674f6
...
@@ -29,9 +29,9 @@ import System.FilePath (FilePath(), takeExtension)
...
@@ -29,9 +29,9 @@ import System.FilePath (FilePath(), takeExtension)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Control.Monad
(
join
)
import
Control.Monad
(
join
)
import
Data.Time
(
UTCTime
(
..
))
import
qualified
Data.Time
as
DT
import
qualified
Data.Time
as
DT
import
Data.Either.Extra
(
partitionEithers
)
import
Data.Either.Extra
(
partitionEithers
)
import
Data.Time
(
UTCTime
(
..
))
import
Data.List
(
concat
)
import
Data.List
(
concat
)
import
qualified
Data.Map
as
DM
import
qualified
Data.Map
as
DM
import
qualified
Data.ByteString
as
DB
import
qualified
Data.ByteString
as
DB
...
...
src/Gargantext/Text/Parsers/PubMed.hs
View file @
681674f6
...
@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org
...
@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental
Stability : experimental
Portability : POSIX
Portability : POSIX
@Gargantext.Text.Parsers.Wikimedia@:
This module provide a parser for wikipedia dump.
This include an xml parser for wikipedia's xml
and an wikimedia to plaintext converter for the wikipedia text field
-}
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings #-}
...
@@ -18,85 +15,151 @@ and an wikimedia to plaintext converter for the wikipedia text field
...
@@ -18,85 +15,151 @@ and an wikimedia to plaintext converter for the wikipedia text field
module
Gargantext.Text.Parsers.PubMed
where
module
Gargantext.Text.Parsers.PubMed
where
{-
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either
-}
import
Control.Monad
(
void
)
import
Data.Conduit.List
as
CL
hiding
(
catMaybes
)
import
Control.Monad
(
join
)
import
Control.Monad
(
join
)
import
GHC.IO
(
FilePath
)
import
GHC.IO
(
FilePath
)
import
Prelude
(
read
)
import
Prelude
(
read
,
print
)
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Control.Applicative
((
<*
))
import
Control.Applicative
((
<*
))
import
Control.Monad.Catch
(
MonadThrow
)
import
Control.Monad.Catch
(
MonadThrow
)
import
Data.Maybe
import
Data.Maybe
(
Maybe
,
catMaybes
)
import
Data.Monoid
(
mconcat
)
import
Data.Monoid
(
mconcat
)
import
Text.XML.Stream.Parse
import
Text.XML.Stream.Parse
import
Data.Conduit
(
runConduit
,
(
.|
),
ConduitT
)
import
Data.Conduit
(
runConduit
,
(
.|
),
ConduitT
)
import
Data.Text
(
Text
,
unpack
)
import
Data.Text
(
Text
,
unpack
,
concat
)
import
Data.XML.Types
(
Event
)
import
Data.XML.Types
(
Event
)
import
Data.ByteString
(
ByteString
)
import
Data.ByteString
(
ByteString
)
import
Data.Time.Segment
(
jour
)
import
Data.Time
(
UTCTime
(
..
))
import
qualified
Data.ByteString.Lazy
as
DBL
import
qualified
Data.ByteString.Lazy
as
DBL
import
Gargantext.Text.Parsers.Wikimedia
import
Gargantext.Text.Parsers.Wikimedia
issueXml
::
Maybe
[
PubMedArticle
]
issueXml
=
pubMedParser
pubMedData
data
PubMedArticle
=
data
PubMedArticle
=
PubMedArticle
{
pubmed_title
::
Maybe
Text
PubMedArticle
{
pubmed_title
::
Maybe
Text
,
pubmed_journal
::
Maybe
Text
,
pubmed_journal
::
Maybe
Text
,
pubmed_abstract
::
Maybe
[
Text
]
,
pubmed_date
::
UTCTime
,
pubmed_year
::
Integer
,
pubmed_month
::
Int
,
pubmed_day
::
Int
}
}
deriving
(
Show
)
deriving
(
Show
)
readPubMedFile
::
FilePath
->
IO
(
Maybe
[
PubMedArticle
]
)
readPubMedFile
::
FilePath
->
IO
()
readPubMedFile
fp
=
do
readPubMedFile
fp
=
do
input
<-
DBL
.
readFile
fp
input
<-
DBL
.
readFile
fp
pure
$
pubMedParser
input
pubMedParser
input
pubMedParser
::
DBL
.
ByteString
->
IO
()
pubMedParser
bstring
=
runConduit
$
parseLBS
def
bstring
.|
parseArticleSet
.|
CL
.
mapM_
print
--parseArticleSet :: MonadThrow m => ConduitT Event o m [PubMedArticle]
parseArticleSet
=
do
as
<-
force
"force"
$
tagIgnoreAttrs
"PubmedArticleSet"
$
manyYield
parsePubMedArticle
-- _ <- many $ ignoreAnyTreeContent
return
as
parsePubMedArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMedArticle
)
parsePubMedArticle
=
do
articles
<-
force
"PubmedArticle"
$
tagIgnoreAttrs
"PubmedArticle"
parsePubMedArticle'
--
_
<-
many
$
ignoreAnyTreeContent
return
articles
parsePubMedArticle'
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMedArticle
)
parsePubMedArticle'
=
do
pubmed_article
<-
tagIgnoreAttrs
"MedlineCitation"
parseMedlineCitation
--
_
<-
tagIgnoreAttrs
"PubmedData"
content
_
<-
many
$
ignoreAnyTreeContent
return
pubmed_article
pubMedParser
::
DBL
.
ByteString
->
Maybe
[
PubMedArticle
]
parseMedlineCitation
::
MonadThrow
m
=>
ConduitT
Event
o
m
PubMedArticle
pubMedParser
bstring
=
runConduit
$
parseLBS
def
bstring
.|
force
"Pubmed"
parseArticles
parseMedlineCitation
=
do
a
<-
force
"article"
$
manyTagsUntil
"Article"
parseArticle
_
<-
many
$
ignoreAnyTreeContent
return
a
parseArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
PubMedArticle
parseArticle
=
do
(
journal
,
maybePubDate
)
<-
force
"journal"
$
manyTagsUntil
"Journal"
$
do
maybePubDate'
<-
manyTagsUntil
"JournalIssue"
$
do
maybePubDate''
<-
manyTagsUntil
"PubDate"
$
do
y
<-
tagIgnoreAttrs
"Year"
content
m
<-
tagIgnoreAttrs
"Month"
content
d
<-
tagIgnoreAttrs
"Day"
content
return
(
y
,
m
,
d
)
return
maybePubDate''
j
<-
manyTagsUntil
"Title"
content
_
<-
many
$
ignoreAnyTreeContent
return
(
j
,
join
maybePubDate'
)
parseArticles
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
[
PubMedArticle
])
title
<-
do
parseArticles
=
tagIgnoreAttrs
"PubmedArticleSet"
$
many
parseArticle
t
<-
manyTagsUntil
"ArticleTitle"
content
return
t
parseArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMedArticle
)
abstracts
<-
do
parseArticle
=
tagIgnoreAttrs
"PubmedArticle"
parseMedlineCitation
as
<-
manyTagsUntil
"Abstract"
$
many
$
do
txt
<-
tagIgnoreAttrs
"AbstractText"
$
do
c
<-
content
_
<-
many
$
ignoreAnyTreeContent
return
c
_
<-
many
$
ignoreAnyTreeContent
return
txt
return
as
-- TODO add authos
(
year
,
month
,
day
)
<-
case
maybePubDate
of
Nothing
->
force
"ArticleDate"
$
manyTagsUntil
"ArticleDate"
$
do
y
<-
force
"Year"
$
tagIgnoreAttrs
"Year"
content
m
<-
force
"Month"
$
tagIgnoreAttrs
"Month"
content
d
<-
force
"Day"
$
tagIgnoreAttrs
"Day"
content
return
(
read
$
unpack
y
,
read
$
unpack
m
,
read
$
unpack
d
)
Just
(
Just
y
,
Just
m
,
Just
d
)
->
return
(
read
$
unpack
"1"
,
read
$
unpack
"3"
,
read
$
unpack
"3"
)
_
->
panic
"error date"
parseMedlineCitation
::
MonadThrow
m
=>
ConduitT
Event
o
m
PubMedArticle
parseMedlineCitation
=
force
"medlineCitation"
$
tagIgnoreAttrs
"MedlineCitation"
$
do
_
<-
manyTagsUntil_
"Article"
journal
<-
tagIgnoreAttrs
"Journal"
$
force
"journal"
$
manyTagsUntil
"Title"
content
title
<-
manyTagsUntil
"ArticleTitle"
$
force
"title"
$
manyTagsUntil
"ArticleTitle"
content
_
<-
many
$
ignoreAnyTreeContent
_
<-
many
$
ignoreAnyTreeContent
return
$
PubMedArticle
title
journal
return
$
PubMedArticle
title
journal
abstracts
(
jour
year
month
day
)
year
month
day
pubMedData
::
DBL
.
ByteString
pubMedData
::
DBL
.
ByteString
pubMedData
=
mconcat
pubMedData
=
mconcat
[
"<?xml version=
\"
1.0
\"
?>"
[
"<?xml version=
\"
1.0
\"
?>
\n
"
,
"<!DOCTYPE PubmedArticleSet PUBLIC
\"
-//NLM//DTD PubMedArticle, 1st June 2018//EN
\"
\"
https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd
\"
>"
,
"<!DOCTYPE PubmedArticleSet PUBLIC
\"
-//NLM//DTD PubMedArticle, 1st June 2018//EN
\"
\"
https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd
\"
>
\n
"
,
"<PubmedArticleSet>"
,
"<PubmedArticleSet>
\n
"
,
"<PubmedArticle>"
,
"<PubmedArticle>
\n
"
,
"<MedlineCitation Status=
\"
Publisher
\"
Owner=
\"
NLM
\"
>"
,
" <MedlineCitation Status=
\"
Publisher
\"
Owner=
\"
NLM
\"
>
\n
"
,
" <PMID Version=
\"
1
\"
>30357468</PMID>"
,
" <PMID Version=
\"
1
\"
>30357468</PMID>
\n
"
,
" <DateRevised>"
,
" <DateRevised>
\n
"
,
" <Year>2018</Year>"
,
" <Year>2018</Year>
\n
"
,
" </DateRevised>"
,
" </DateRevised>
\n
"
,
" <Article PubModel=
\"
Print-Electronic
\"
>"
,
" <Article PubModel=
\"
Print-Electronic
\"
>
\n
"
,
" <Journal>"
,
" <Journal>
\n
"
,
" <ISSN IssnType=
\"
Electronic
\"
>1432-1076</ISSN>"
,
" <ISSN IssnType=
\"
Electronic
\"
>1432-1076</ISSN>
\n
"
,
" <Title>European journal of pediatrics</Title>"
,
" <Title>European journal of pediatrics</Title>
\n
"
,
" </Journal>"
,
" </Journal>
\n
"
,
" <ArticleTitle>European journal of pediatrics</ArticleTitle>"
,
" <ArticleTitle>Title of the Article</ArticleTitle>
\n
"
,
" </Article>"
,
" <ELocationID EIdType=
\"
doi
\"
ValidYN=
\"
Y
\"
>10.1007/s00431-018-3270-3</ELocationID>
\n
"
,
"</MedlineCitation>"
,
" <Abstract>
\n
"
,
"</PubmedArticle>"
,
" <AbstractText>Abstract Text.</AbstractText>
\n
"
,
"</PubmedArticleSet>"
,
" </Abstract>
\n
"
,
" <AuthorList>
\n
"
,
" </AuthorList>
\n
"
,
" </Article>
\n
"
,
" </MedlineCitation>
\n
"
,
" <PubmedData>
\n
"
,
" <History>
\n
"
,
" </History>
\n
"
,
" </PubmedData>
\n
"
,
"</PubmedArticle>
\n
"
,
"</PubmedArticleSet>
\n
"
]
]
src/Gargantext/Text/Parsers/Wikimedia.hs
View file @
681674f6
...
@@ -66,7 +66,7 @@ manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
...
@@ -66,7 +66,7 @@ manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_
=
many_
.
ignoreTreeContent
.
tagUntil
manyTagsUntil_
=
many_
.
ignoreTreeContent
.
tagUntil
manyTagsUntil_'
::
MonadThrow
m
=>
Name
->
ConduitT
Event
o
m
()
manyTagsUntil_'
::
MonadThrow
m
=>
Name
->
ConduitT
Event
o
m
()
manyTagsUntil_'
=
many_
.
ignoreTag
.
tagUntil
manyTagsUntil_'
=
many_
.
ignore
Empty
Tag
.
tagUntil
-- | Utility function that parses nothing but the tag given,
-- | Utility function that parses nothing but the tag given,
-- usefull because we have to consume every data.
-- usefull because we have to consume every data.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment