Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
198
Issues
198
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
681674f6
Commit
681674f6
authored
Nov 12, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TEXT][PARSER][PUBMED] PubDate or ArticleDate are not reliable.
parent
03ffdda9
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
118 additions
and
55 deletions
+118
-55
Parsers.hs
src/Gargantext/Text/Parsers.hs
+1
-1
PubMed.hs
src/Gargantext/Text/Parsers/PubMed.hs
+116
-53
Wikimedia.hs
src/Gargantext/Text/Parsers/Wikimedia.hs
+1
-1
No files found.
src/Gargantext/Text/Parsers.hs
View file @
681674f6
...
...
@@ -29,9 +29,9 @@ import System.FilePath (FilePath(), takeExtension)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Control.Monad
(
join
)
import
Data.Time
(
UTCTime
(
..
))
import
qualified
Data.Time
as
DT
import
Data.Either.Extra
(
partitionEithers
)
import
Data.Time
(
UTCTime
(
..
))
import
Data.List
(
concat
)
import
qualified
Data.Map
as
DM
import
qualified
Data.ByteString
as
DB
...
...
src/Gargantext/Text/Parsers/PubMed.hs
View file @
681674f6
...
...
@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext.Text.Parsers.Wikimedia@:
This module provide a parser for wikipedia dump.
This include an xml parser for wikipedia's xml
and an wikimedia to plaintext converter for the wikipedia text field
-}
{-# LANGUAGE OverloadedStrings #-}
...
...
@@ -18,85 +15,151 @@ and an wikimedia to plaintext converter for the wikipedia text field
module
Gargantext.Text.Parsers.PubMed
where
{-
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either
-}
import
Control.Monad
(
void
)
import
Data.Conduit.List
as
CL
hiding
(
catMaybes
)
import
Control.Monad
(
join
)
import
GHC.IO
(
FilePath
)
import
Prelude
(
read
)
import
Prelude
(
read
,
print
)
import
Gargantext.Prelude
import
Control.Applicative
((
<*
))
import
Control.Monad.Catch
(
MonadThrow
)
import
Data.Maybe
import
Data.Maybe
(
Maybe
,
catMaybes
)
import
Data.Monoid
(
mconcat
)
import
Text.XML.Stream.Parse
import
Data.Conduit
(
runConduit
,
(
.|
),
ConduitT
)
import
Data.Text
(
Text
,
unpack
)
import
Data.Text
(
Text
,
unpack
,
concat
)
import
Data.XML.Types
(
Event
)
import
Data.ByteString
(
ByteString
)
import
Data.Time.Segment
(
jour
)
import
Data.Time
(
UTCTime
(
..
))
import
qualified
Data.ByteString.Lazy
as
DBL
import
Gargantext.Text.Parsers.Wikimedia
issueXml
::
Maybe
[
PubMedArticle
]
issueXml
=
pubMedParser
pubMedData
data
PubMedArticle
=
PubMedArticle
{
pubmed_title
::
Maybe
Text
,
pubmed_journal
::
Maybe
Text
,
pubmed_abstract
::
Maybe
[
Text
]
,
pubmed_date
::
UTCTime
,
pubmed_year
::
Integer
,
pubmed_month
::
Int
,
pubmed_day
::
Int
}
deriving
(
Show
)
readPubMedFile
::
FilePath
->
IO
(
Maybe
[
PubMedArticle
]
)
readPubMedFile
::
FilePath
->
IO
()
readPubMedFile
fp
=
do
input
<-
DBL
.
readFile
fp
pure
$
pubMedParser
input
pubMedParser
::
DBL
.
ByteString
->
Maybe
[
PubMedArticle
]
pubMedParser
bstring
=
runConduit
$
parseLBS
def
bstring
.|
force
"Pubmed"
parseArticles
parseArticles
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
[
PubMedArticle
])
parseArticles
=
tagIgnoreAttrs
"PubmedArticleSet"
$
many
parseArticle
parseArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMedArticle
)
parseArticle
=
tagIgnoreAttrs
"PubmedArticle"
parseMedlineCitation
pubMedParser
input
pubMedParser
::
DBL
.
ByteString
->
IO
()
pubMedParser
bstring
=
runConduit
$
parseLBS
def
bstring
.|
parseArticleSet
.|
CL
.
mapM_
print
--parseArticleSet :: MonadThrow m => ConduitT Event o m [PubMedArticle]
parseArticleSet
=
do
as
<-
force
"force"
$
tagIgnoreAttrs
"PubmedArticleSet"
$
manyYield
parsePubMedArticle
-- _ <- many $ ignoreAnyTreeContent
return
as
parsePubMedArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMedArticle
)
parsePubMedArticle
=
do
articles
<-
force
"PubmedArticle"
$
tagIgnoreAttrs
"PubmedArticle"
parsePubMedArticle'
--
_
<-
many
$
ignoreAnyTreeContent
return
articles
parsePubMedArticle'
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMedArticle
)
parsePubMedArticle'
=
do
pubmed_article
<-
tagIgnoreAttrs
"MedlineCitation"
parseMedlineCitation
--
_
<-
tagIgnoreAttrs
"PubmedData"
content
_
<-
many
$
ignoreAnyTreeContent
return
pubmed_article
parseMedlineCitation
::
MonadThrow
m
=>
ConduitT
Event
o
m
PubMedArticle
parseMedlineCitation
=
force
"medlineCitation"
$
tagIgnoreAttrs
"MedlineCitation"
$
do
_
<-
manyTagsUntil_
"Article"
journal
<-
tagIgnoreAttrs
"Journal"
$
force
"journal"
$
manyTagsUntil
"Title"
content
title
<-
manyTagsUntil
"ArticleTitle"
$
force
"title"
$
manyTagsUntil
"ArticleTitle"
content
parseMedlineCitation
=
do
a
<-
force
"article"
$
manyTagsUntil
"Article"
parseArticle
_
<-
many
$
ignoreAnyTreeContent
return
$
PubMedArticle
title
journal
return
a
parseArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
PubMedArticle
parseArticle
=
do
(
journal
,
maybePubDate
)
<-
force
"journal"
$
manyTagsUntil
"Journal"
$
do
maybePubDate'
<-
manyTagsUntil
"JournalIssue"
$
do
maybePubDate''
<-
manyTagsUntil
"PubDate"
$
do
y
<-
tagIgnoreAttrs
"Year"
content
m
<-
tagIgnoreAttrs
"Month"
content
d
<-
tagIgnoreAttrs
"Day"
content
return
(
y
,
m
,
d
)
return
maybePubDate''
j
<-
manyTagsUntil
"Title"
content
_
<-
many
$
ignoreAnyTreeContent
return
(
j
,
join
maybePubDate'
)
title
<-
do
t
<-
manyTagsUntil
"ArticleTitle"
content
return
t
abstracts
<-
do
as
<-
manyTagsUntil
"Abstract"
$
many
$
do
txt
<-
tagIgnoreAttrs
"AbstractText"
$
do
c
<-
content
_
<-
many
$
ignoreAnyTreeContent
return
c
_
<-
many
$
ignoreAnyTreeContent
return
txt
return
as
-- TODO add authos
(
year
,
month
,
day
)
<-
case
maybePubDate
of
Nothing
->
force
"ArticleDate"
$
manyTagsUntil
"ArticleDate"
$
do
y
<-
force
"Year"
$
tagIgnoreAttrs
"Year"
content
m
<-
force
"Month"
$
tagIgnoreAttrs
"Month"
content
d
<-
force
"Day"
$
tagIgnoreAttrs
"Day"
content
return
(
read
$
unpack
y
,
read
$
unpack
m
,
read
$
unpack
d
)
Just
(
Just
y
,
Just
m
,
Just
d
)
->
return
(
read
$
unpack
"1"
,
read
$
unpack
"3"
,
read
$
unpack
"3"
)
_
->
panic
"error date"
_
<-
many
$
ignoreAnyTreeContent
return
$
PubMedArticle
title
journal
abstracts
(
jour
year
month
day
)
year
month
day
pubMedData
::
DBL
.
ByteString
pubMedData
=
mconcat
[
"<?xml version=
\"
1.0
\"
?>"
,
"<!DOCTYPE PubmedArticleSet PUBLIC
\"
-//NLM//DTD PubMedArticle, 1st June 2018//EN
\"
\"
https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd
\"
>"
,
"<PubmedArticleSet>"
,
"<PubmedArticle>"
,
"<MedlineCitation Status=
\"
Publisher
\"
Owner=
\"
NLM
\"
>"
,
" <PMID Version=
\"
1
\"
>30357468</PMID>"
,
" <DateRevised>"
,
" <Year>2018</Year>"
,
" </DateRevised>"
,
" <Article PubModel=
\"
Print-Electronic
\"
>"
,
" <Journal>"
,
" <ISSN IssnType=
\"
Electronic
\"
>1432-1076</ISSN>"
,
" <Title>European journal of pediatrics</Title>"
,
" </Journal>"
,
" <ArticleTitle>European journal of pediatrics</ArticleTitle>"
,
" </Article>"
,
"</MedlineCitation>"
,
"</PubmedArticle>"
,
"</PubmedArticleSet>"
[
"<?xml version=
\"
1.0
\"
?>
\n
"
,
"<!DOCTYPE PubmedArticleSet PUBLIC
\"
-//NLM//DTD PubMedArticle, 1st June 2018//EN
\"
\"
https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd
\"
>
\n
"
,
"<PubmedArticleSet>
\n
"
,
"<PubmedArticle>
\n
"
,
" <MedlineCitation Status=
\"
Publisher
\"
Owner=
\"
NLM
\"
>
\n
"
,
" <PMID Version=
\"
1
\"
>30357468</PMID>
\n
"
,
" <DateRevised>
\n
"
,
" <Year>2018</Year>
\n
"
,
" </DateRevised>
\n
"
,
" <Article PubModel=
\"
Print-Electronic
\"
>
\n
"
,
" <Journal>
\n
"
,
" <ISSN IssnType=
\"
Electronic
\"
>1432-1076</ISSN>
\n
"
,
" <Title>European journal of pediatrics</Title>
\n
"
,
" </Journal>
\n
"
,
" <ArticleTitle>Title of the Article</ArticleTitle>
\n
"
,
" <ELocationID EIdType=
\"
doi
\"
ValidYN=
\"
Y
\"
>10.1007/s00431-018-3270-3</ELocationID>
\n
"
,
" <Abstract>
\n
"
,
" <AbstractText>Abstract Text.</AbstractText>
\n
"
,
" </Abstract>
\n
"
,
" <AuthorList>
\n
"
,
" </AuthorList>
\n
"
,
" </Article>
\n
"
,
" </MedlineCitation>
\n
"
,
" <PubmedData>
\n
"
,
" <History>
\n
"
,
" </History>
\n
"
,
" </PubmedData>
\n
"
,
"</PubmedArticle>
\n
"
,
"</PubmedArticleSet>
\n
"
]
src/Gargantext/Text/Parsers/Wikimedia.hs
View file @
681674f6
...
...
@@ -66,7 +66,7 @@ manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_
=
many_
.
ignoreTreeContent
.
tagUntil
manyTagsUntil_'
::
MonadThrow
m
=>
Name
->
ConduitT
Event
o
m
()
manyTagsUntil_'
=
many_
.
ignoreTag
.
tagUntil
manyTagsUntil_'
=
many_
.
ignore
Empty
Tag
.
tagUntil
-- | Utility function that parses nothing but the tag given,
-- usefull because we have to consume every data.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment