Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
681674f6
Commit
681674f6
authored
Nov 12, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TEXT][PARSER][PUBMED] PubDate or ArticleDate are not reliable.
parent
03ffdda9
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
118 additions
and
55 deletions
+118
-55
Parsers.hs
src/Gargantext/Text/Parsers.hs
+1
-1
PubMed.hs
src/Gargantext/Text/Parsers/PubMed.hs
+116
-53
Wikimedia.hs
src/Gargantext/Text/Parsers/Wikimedia.hs
+1
-1
No files found.
src/Gargantext/Text/Parsers.hs
View file @
681674f6
...
...
@@ -29,9 +29,9 @@ import System.FilePath (FilePath(), takeExtension)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Control.Monad
(
join
)
import
Data.Time
(
UTCTime
(
..
))
import
qualified
Data.Time
as
DT
import
Data.Either.Extra
(
partitionEithers
)
import
Data.Time
(
UTCTime
(
..
))
import
Data.List
(
concat
)
import
qualified
Data.Map
as
DM
import
qualified
Data.ByteString
as
DB
...
...
src/Gargantext/Text/Parsers/PubMed.hs
View file @
681674f6
...
...
@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext.Text.Parsers.Wikimedia@:
This module provides a parser for Wikipedia dumps.
This includes an XML parser for Wikipedia's XML
and an wikimedia to plaintext converter for the wikipedia text field
-}
{-# LANGUAGE OverloadedStrings #-}
...
...
@@ -18,85 +15,151 @@ and an wikimedia to plaintext converter for the wikipedia text field
module
Gargantext.Text.Parsers.PubMed
where
{-
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either
-}
import
Control.Monad
(
void
)
import
Data.Conduit.List
as
CL
hiding
(
catMaybes
)
import
Control.Monad
(
join
)
import
GHC.IO
(
FilePath
)
import
Prelude
(
read
)
import
Prelude
(
read
,
print
)
import
Gargantext.Prelude
import
Control.Applicative
((
<*
))
import
Control.Monad.Catch
(
MonadThrow
)
import
Data.Maybe
import
Data.Maybe
(
Maybe
,
catMaybes
)
import
Data.Monoid
(
mconcat
)
import
Text.XML.Stream.Parse
import
Data.Conduit
(
runConduit
,
(
.|
),
ConduitT
)
import
Data.Text
(
Text
,
unpack
)
import
Data.Text
(
Text
,
unpack
,
concat
)
import
Data.XML.Types
(
Event
)
import
Data.ByteString
(
ByteString
)
import
Data.Time.Segment
(
jour
)
import
Data.Time
(
UTCTime
(
..
))
import
qualified
Data.ByteString.Lazy
as
DBL
import
Gargantext.Text.Parsers.Wikimedia
issueXml
::
Maybe
[
PubMedArticle
]
issueXml
=
pubMedParser
pubMedData
data
PubMedArticle
=
PubMedArticle
{
pubmed_title
::
Maybe
Text
,
pubmed_journal
::
Maybe
Text
,
pubmed_abstract
::
Maybe
[
Text
]
,
pubmed_date
::
UTCTime
,
pubmed_year
::
Integer
,
pubmed_month
::
Int
,
pubmed_day
::
Int
}
deriving
(
Show
)
readPubMedFile
::
FilePath
->
IO
(
Maybe
[
PubMedArticle
]
)
readPubMedFile
::
FilePath
->
IO
()
readPubMedFile
fp
=
do
input
<-
DBL
.
readFile
fp
pure
$
pubMedParser
input
pubMedParser
::
DBL
.
ByteString
->
Maybe
[
PubMedArticle
]
pubMedParser
bstring
=
runConduit
$
parseLBS
def
bstring
.|
force
"Pubmed"
parseArticles
parseArticles
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
[
PubMedArticle
])
parseArticles
=
tagIgnoreAttrs
"PubmedArticleSet"
$
many
parseArticle
parseArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMedArticle
)
parseArticle
=
tagIgnoreAttrs
"PubmedArticle"
parseMedlineCitation
pubMedParser
input
pubMedParser
::
DBL
.
ByteString
->
IO
()
pubMedParser
bstring
=
runConduit
$
parseLBS
def
bstring
.|
parseArticleSet
.|
CL
.
mapM_
print
--parseArticleSet :: MonadThrow m => ConduitT Event o m [PubMedArticle]
parseArticleSet
=
do
as
<-
force
"force"
$
tagIgnoreAttrs
"PubmedArticleSet"
$
manyYield
parsePubMedArticle
-- _ <- many $ ignoreAnyTreeContent
return
as
parsePubMedArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMedArticle
)
parsePubMedArticle
=
do
articles
<-
force
"PubmedArticle"
$
tagIgnoreAttrs
"PubmedArticle"
parsePubMedArticle'
--
_
<-
many
$
ignoreAnyTreeContent
return
articles
parsePubMedArticle'
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMedArticle
)
parsePubMedArticle'
=
do
pubmed_article
<-
tagIgnoreAttrs
"MedlineCitation"
parseMedlineCitation
--
_
<-
tagIgnoreAttrs
"PubmedData"
content
_
<-
many
$
ignoreAnyTreeContent
return
pubmed_article
parseMedlineCitation
::
MonadThrow
m
=>
ConduitT
Event
o
m
PubMedArticle
parseMedlineCitation
=
force
"medlineCitation"
$
tagIgnoreAttrs
"MedlineCitation"
$
do
_
<-
manyTagsUntil_
"Article"
journal
<-
tagIgnoreAttrs
"Journal"
$
force
"journal"
$
manyTagsUntil
"Title"
content
title
<-
manyTagsUntil
"ArticleTitle"
$
force
"title"
$
manyTagsUntil
"ArticleTitle"
content
parseMedlineCitation
=
do
a
<-
force
"article"
$
manyTagsUntil
"Article"
parseArticle
_
<-
many
$
ignoreAnyTreeContent
return
$
PubMedArticle
title
journal
return
a
parseArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
PubMedArticle
parseArticle
=
do
(
journal
,
maybePubDate
)
<-
force
"journal"
$
manyTagsUntil
"Journal"
$
do
maybePubDate'
<-
manyTagsUntil
"JournalIssue"
$
do
maybePubDate''
<-
manyTagsUntil
"PubDate"
$
do
y
<-
tagIgnoreAttrs
"Year"
content
m
<-
tagIgnoreAttrs
"Month"
content
d
<-
tagIgnoreAttrs
"Day"
content
return
(
y
,
m
,
d
)
return
maybePubDate''
j
<-
manyTagsUntil
"Title"
content
_
<-
many
$
ignoreAnyTreeContent
return
(
j
,
join
maybePubDate'
)
title
<-
do
t
<-
manyTagsUntil
"ArticleTitle"
content
return
t
abstracts
<-
do
as
<-
manyTagsUntil
"Abstract"
$
many
$
do
txt
<-
tagIgnoreAttrs
"AbstractText"
$
do
c
<-
content
_
<-
many
$
ignoreAnyTreeContent
return
c
_
<-
many
$
ignoreAnyTreeContent
return
txt
return
as
-- TODO add authors
(
year
,
month
,
day
)
<-
case
maybePubDate
of
Nothing
->
force
"ArticleDate"
$
manyTagsUntil
"ArticleDate"
$
do
y
<-
force
"Year"
$
tagIgnoreAttrs
"Year"
content
m
<-
force
"Month"
$
tagIgnoreAttrs
"Month"
content
d
<-
force
"Day"
$
tagIgnoreAttrs
"Day"
content
return
(
read
$
unpack
y
,
read
$
unpack
m
,
read
$
unpack
d
)
Just
(
Just
y
,
Just
m
,
Just
d
)
->
return
(
read
$
unpack
"1"
,
read
$
unpack
"3"
,
read
$
unpack
"3"
)
_
->
panic
"error date"
_
<-
many
$
ignoreAnyTreeContent
return
$
PubMedArticle
title
journal
abstracts
(
jour
year
month
day
)
year
month
day
pubMedData
::
DBL
.
ByteString
pubMedData
=
mconcat
[
"<?xml version=
\"
1.0
\"
?>"
,
"<!DOCTYPE PubmedArticleSet PUBLIC
\"
-//NLM//DTD PubMedArticle, 1st June 2018//EN
\"
\"
https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd
\"
>"
,
"<PubmedArticleSet>"
,
"<PubmedArticle>"
,
"<MedlineCitation Status=
\"
Publisher
\"
Owner=
\"
NLM
\"
>"
,
" <PMID Version=
\"
1
\"
>30357468</PMID>"
,
" <DateRevised>"
,
" <Year>2018</Year>"
,
" </DateRevised>"
,
" <Article PubModel=
\"
Print-Electronic
\"
>"
,
" <Journal>"
,
" <ISSN IssnType=
\"
Electronic
\"
>1432-1076</ISSN>"
,
" <Title>European journal of pediatrics</Title>"
,
" </Journal>"
,
" <ArticleTitle>European journal of pediatrics</ArticleTitle>"
,
" </Article>"
,
"</MedlineCitation>"
,
"</PubmedArticle>"
,
"</PubmedArticleSet>"
[
"<?xml version=
\"
1.0
\"
?>
\n
"
,
"<!DOCTYPE PubmedArticleSet PUBLIC
\"
-//NLM//DTD PubMedArticle, 1st June 2018//EN
\"
\"
https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd
\"
>
\n
"
,
"<PubmedArticleSet>
\n
"
,
"<PubmedArticle>
\n
"
,
" <MedlineCitation Status=
\"
Publisher
\"
Owner=
\"
NLM
\"
>
\n
"
,
" <PMID Version=
\"
1
\"
>30357468</PMID>
\n
"
,
" <DateRevised>
\n
"
,
" <Year>2018</Year>
\n
"
,
" </DateRevised>
\n
"
,
" <Article PubModel=
\"
Print-Electronic
\"
>
\n
"
,
" <Journal>
\n
"
,
" <ISSN IssnType=
\"
Electronic
\"
>1432-1076</ISSN>
\n
"
,
" <Title>European journal of pediatrics</Title>
\n
"
,
" </Journal>
\n
"
,
" <ArticleTitle>Title of the Article</ArticleTitle>
\n
"
,
" <ELocationID EIdType=
\"
doi
\"
ValidYN=
\"
Y
\"
>10.1007/s00431-018-3270-3</ELocationID>
\n
"
,
" <Abstract>
\n
"
,
" <AbstractText>Abstract Text.</AbstractText>
\n
"
,
" </Abstract>
\n
"
,
" <AuthorList>
\n
"
,
" </AuthorList>
\n
"
,
" </Article>
\n
"
,
" </MedlineCitation>
\n
"
,
" <PubmedData>
\n
"
,
" <History>
\n
"
,
" </History>
\n
"
,
" </PubmedData>
\n
"
,
"</PubmedArticle>
\n
"
,
"</PubmedArticleSet>
\n
"
]
src/Gargantext/Text/Parsers/Wikimedia.hs
View file @
681674f6
...
...
@@ -66,7 +66,7 @@ manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_
=
many_
.
ignoreTreeContent
.
tagUntil
manyTagsUntil_'
::
MonadThrow
m
=>
Name
->
ConduitT
Event
o
m
()
manyTagsUntil_'
=
many_
.
ignoreTag
.
tagUntil
manyTagsUntil_'
=
many_
.
ignore
Empty
Tag
.
tagUntil
-- | Utility function that parses nothing but the tag given,
-- useful because we have to consume all the data.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment