Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
a9b24132
Commit
a9b24132
authored
May 17, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FIX] gitlab.com not accessible.
parent
8b7506c0
Pipeline
#394
canceled with stage
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
3 additions
and
169 deletions
+3
-169
PubMed.hs
src/Gargantext/Text/Parsers/PubMed.hs
+0
-167
stack.yaml
stack.yaml
+3
-2
No files found.
src/Gargantext/Text/Parsers/PubMed.hs
deleted
100644 → 0
View file @
8b7506c0
{-|
Module : Gargantext.Text.Parsers.PubMed
Description : Parser for PubMed
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This version of Parsers fixes the Date of publication in Gargantext
(V3) parser of PubMed. Indeed, we can not rely neither on Journal
Publication Date neither on Article publication date, which are
incomplete structurally but for its interpretation too. Then, to
simplify and uniformize data, date of publication of database insertion
is used.
TODO:
- Add authors
- factorize
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Text.Parsers.PubMed
where
import
Data.Conduit.List
as
CL
hiding
(
catMaybes
,
head
)
import
Control.Monad
(
join
)
import
GHC.IO
(
FilePath
)
import
Prelude
(
read
)
import
Gargantext.Prelude
import
Control.Monad.Catch
(
MonadThrow
)
import
Data.Maybe
(
Maybe
)
import
Data.Monoid
(
mconcat
)
import
Text.XML.Stream.Parse
import
Data.Conduit
(
runConduit
,
(
.|
),
ConduitT
)
import
Data.Text
(
Text
,
unpack
)
import
Data.XML.Types
(
Event
)
import
Data.Time.Segment
(
jour
)
import
Data.Time
(
UTCTime
(
..
))
import
qualified
Data.ByteString.Lazy
as
DBL
import
Gargantext.Text.Parsers.Wikimedia
data
PubMed
=
PubMed
{
pubmed_article
::
PubMedArticle
,
pubmed_date
::
PubMedData
}
deriving
Show
data
PubMedArticle
=
PubMedArticle
{
pubmed_title
::
Maybe
Text
,
pubmed_journal
::
Maybe
Text
,
pubmed_abstract
::
Maybe
[
Text
]
}
deriving
(
Show
)
data
PubMedData
=
PubMedData
{
pubmedData_date
::
UTCTime
,
pubmedData_year
::
Integer
,
pubmedData_month
::
Int
,
pubmedData_day
::
Int
}
deriving
(
Show
)
readPubMedFile
::
FilePath
->
IO
[
PubMed
]
readPubMedFile
fp
=
do
input
<-
DBL
.
readFile
fp
pubMedParser
input
pubMedParser
::
DBL
.
ByteString
->
IO
[
PubMed
]
pubMedParser
bstring
=
runConduit
$
parseLBS
def
bstring
.|
parseArticleSet
.|
CL
.
consume
parseArticleSet
::
MonadThrow
m
=>
ConduitT
Event
PubMed
m
()
parseArticleSet
=
do
as
<-
force
"force"
$
tagIgnoreAttrs
"PubmedArticleSet"
$
manyYield
parsePubMedArticle
return
as
parsePubMedArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
Maybe
PubMed
)
parsePubMedArticle
=
do
articles
<-
tagIgnoreAttrs
"PubmedArticle"
parsePubMedArticle'
return
articles
parsePubMedArticle'
::
MonadThrow
m
=>
ConduitT
Event
o
m
(
PubMed
)
parsePubMedArticle'
=
do
article
<-
force
"MedlineCitation"
$
tagIgnoreAttrs
"MedlineCitation"
parseMedlineCitation
dates
<-
tagIgnoreAttrs
"PubmedData"
$
do
dates'
<-
tagIgnoreAttrs
"History"
$
many
$
tagIgnoreAttrs
"PubMedPubDate"
$
do
y'
<-
force
"Year"
$
tagIgnoreAttrs
"Year"
content
m'
<-
force
"Month"
$
tagIgnoreAttrs
"Month"
content
d'
<-
force
"Day"
$
tagIgnoreAttrs
"Day"
content
_
<-
many
$
ignoreAnyTreeContent
return
(
read
$
unpack
y'
,
read
$
unpack
m'
,
read
$
unpack
d'
)
_
<-
many
$
ignoreAnyTreeContent
return
dates'
_
<-
many
$
ignoreAnyTreeContent
let
(
y
,
m
,
d
)
=
maybe
(
1
,
1
,
1
)
identity
$
join
$
fmap
head
$
reverse
<$>
join
dates
return
$
PubMed
(
article
)
(
PubMedData
(
jour
y
m
d
)
y
m
d
)
parseMedlineCitation
::
MonadThrow
m
=>
ConduitT
Event
o
m
PubMedArticle
parseMedlineCitation
=
do
a
<-
force
"article"
$
manyTagsUntil
"Article"
parseArticle
_
<-
many
$
ignoreAnyTreeContent
return
a
parseArticle
::
MonadThrow
m
=>
ConduitT
Event
o
m
PubMedArticle
parseArticle
=
do
journal
<-
force
"journal"
$
manyTagsUntil
"Journal"
$
do
j
<-
manyTagsUntil
"Title"
content
_
<-
many
$
ignoreAnyTreeContent
return
j
title
<-
do
t
<-
manyTagsUntil
"ArticleTitle"
content
return
t
abstracts
<-
do
as
<-
manyTagsUntil
"Abstract"
$
many
$
do
txt
<-
tagIgnoreAttrs
"AbstractText"
$
do
c
<-
content
_
<-
many
$
ignoreAnyTreeContent
return
c
_
<-
many
$
ignoreAnyTreeContent
return
txt
return
as
-- TODO add authos
_
<-
many
$
ignoreAnyTreeContent
return
$
PubMedArticle
title
journal
abstracts
pubMedData
::
DBL
.
ByteString
pubMedData
=
mconcat
[
"<?xml version=
\"
1.0
\"
?>
\n
"
,
"<!DOCTYPE PubmedArticleSet PUBLIC
\"
-//NLM//DTD PubMedArticle, 1st June 2018//EN
\"
\"
https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd
\"
>
\n
"
,
"<PubmedArticleSet>
\n
"
,
"<PubmedArticle>
\n
"
,
" <MedlineCitation Status=
\"
Publisher
\"
Owner=
\"
NLM
\"
>
\n
"
,
" <PMID Version=
\"
1
\"
>30357468</PMID>
\n
"
,
" <DateRevised>
\n
"
,
" <Year>2018</Year>
\n
"
,
" </DateRevised>
\n
"
,
" <Article PubModel=
\"
Print-Electronic
\"
>
\n
"
,
" <Journal>
\n
"
,
" <ISSN IssnType=
\"
Electronic
\"
>1432-1076</ISSN>
\n
"
,
" <Title>European journal of pediatrics</Title>
\n
"
,
" </Journal>
\n
"
,
" <ArticleTitle>Title of the Article</ArticleTitle>
\n
"
,
" <ELocationID EIdType=
\"
doi
\"
ValidYN=
\"
Y
\"
>10.1007/s00431-018-3270-3</ELocationID>
\n
"
,
" <Abstract>
\n
"
,
" <AbstractText>Abstract Text.</AbstractText>
\n
"
,
" </Abstract>
\n
"
,
" <AuthorList>
\n
"
,
" </AuthorList>
\n
"
,
" </Article>
\n
"
,
" </MedlineCitation>
\n
"
,
" <PubmedData>
\n
"
,
" <History>
\n
"
,
" </History>
\n
"
,
" </PubmedData>
\n
"
,
"</PubmedArticle>
\n
"
,
"</PubmedArticleSet>
\n
"
]
stack.yaml
View file @
a9b24132
...
...
@@ -6,7 +6,6 @@ packages:
-
'
deps/servant-job'
-
'
deps/clustering-louvain'
-
'
deps/patches-map'
-
'
deps/patches-class'
-
'
deps/haskell-opaleye'
-
'
deps/hsparql'
...
...
@@ -27,7 +26,9 @@ extra-deps:
commit
:
4fd2edf30c141600ffad6d730cc4c1c08a6dbce4
-
git
:
https://gitlab.iscpif.fr/gargantext/crawlers/pubmed
commit
:
dcaa0f5dd53f20648f4f5a615d29163582a4219c
#- opaleye-0.6.7002.0
-
git
:
https://gitlab.iscpif.fr/gargantext/patches-class
commit
:
746b4ce0af8f9e600d555ad7e5b2973a940cdad9
#- opaleye-0.6.7002.0
-
KMP-0.1.0.2
-
accelerate-1.2.0.0
-
aeson-lens-0.5.0.0
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment