1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
{-|
Module : Gargantext.Text.Parsers.PubMed
Description : Parser for PubMed
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This version of Parsers fixes the Date of publication in Gargantext
(V3) parser of PubMed. Indeed, we can not rely neither on Journal
Publication Date neither on Article publication date, which are
incomplete structurally but for its interpretation too. Then, to
simplify and uniformize data, date of publication of database insertion
is used.
TODO:
- Add authors
- factorize
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.PubMed where
import Data.Conduit.List as CL hiding (catMaybes, head)
import Control.Monad (join)
import GHC.IO (FilePath)
import Prelude (read)
import Gargantext.Prelude
import Control.Monad.Catch (MonadThrow)
import Data.Maybe (Maybe)
import Data.Monoid (mconcat)
import Text.XML.Stream.Parse
import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack)
import Data.XML.Types (Event)
import Data.Time.Segment (jour)
import Data.Time (UTCTime(..))
import qualified Data.ByteString.Lazy as DBL
import Gargantext.Text.Parsers.Wikimedia
data PubMed =
PubMed { pubmed_article :: PubMedArticle
, pubmed_date :: PubMedData
} deriving Show
data PubMedArticle =
PubMedArticle { pubmed_title :: Maybe Text
, pubmed_journal :: Maybe Text
, pubmed_abstract :: Maybe [Text]
}
deriving (Show)
data PubMedData =
PubMedData { pubmedData_date :: UTCTime
, pubmedData_year :: Integer
, pubmedData_month :: Int
, pubmedData_day :: Int
} deriving (Show)
readPubMedFile :: FilePath -> IO [PubMed]
readPubMedFile fp = do
input <- DBL.readFile fp
pubMedParser input
pubMedParser :: DBL.ByteString -> IO [PubMed]
pubMedParser bstring = runConduit $ parseLBS def bstring
.| parseArticleSet
.| CL.consume
parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
parseArticleSet = do
as <- force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
return as
parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
parsePubMedArticle = do
articles <- tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
return articles
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m (PubMed)
parsePubMedArticle' = do
article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
dates <- tagIgnoreAttrs "PubmedData" $ do
dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
y' <- force "Year" $ tagIgnoreAttrs "Year" content
m' <- force "Month" $ tagIgnoreAttrs "Month" content
d' <- force "Day" $ tagIgnoreAttrs "Day" content
_ <- many $ ignoreAnyTreeContent
return (read $ unpack y', read $ unpack m', read $ unpack d')
_ <- many $ ignoreAnyTreeContent
return dates'
_ <- many $ ignoreAnyTreeContent
let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates
return $ PubMed (article) (PubMedData (jour y m d) y m d)
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = do
a <- force "article" $ manyTagsUntil "Article" parseArticle
_ <- many $ ignoreAnyTreeContent
return a
parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
parseArticle = do
journal <- force "journal" $ manyTagsUntil "Journal" $ do
j <- manyTagsUntil "Title" content
_ <- many $ ignoreAnyTreeContent
return j
title <- do
t <- manyTagsUntil "ArticleTitle" content
return t
abstracts <- do
as <- manyTagsUntil "Abstract" $ many $ do
txt <- tagIgnoreAttrs "AbstractText" $ do
c <- content
_ <- many $ ignoreAnyTreeContent
return c
_ <- many $ ignoreAnyTreeContent
return txt
return as
-- TODO add authos
_ <- many $ ignoreAnyTreeContent
return $ PubMedArticle title journal abstracts
pubMedData :: DBL.ByteString
pubMedData = mconcat
[ "<?xml version=\"1.0\"?>\n"
, "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
, "<PubmedArticleSet>\n"
, "<PubmedArticle>\n"
, " <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
, " <PMID Version=\"1\">30357468</PMID>\n"
, " <DateRevised>\n"
, " <Year>2018</Year>\n"
, " </DateRevised>\n"
, " <Article PubModel=\"Print-Electronic\">\n"
, " <Journal>\n"
, " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
, " <Title>European journal of pediatrics</Title>\n"
, " </Journal>\n"
, " <ArticleTitle>Title of the Article</ArticleTitle>\n"
, " <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
, " <Abstract>\n"
, " <AbstractText>Abstract Text.</AbstractText>\n"
, " </Abstract>\n"
, " <AuthorList>\n"
, " </AuthorList>\n"
, " </Article>\n"
, " </MedlineCitation>\n"
, " <PubmedData>\n"
, " <History>\n"
, " </History>\n"
, " </PubmedData>\n"
, "</PubmedArticle>\n"
, "</PubmedArticleSet>\n"
]