Commit a9b24132 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FIX] gitlab.com not accessible.

parent 8b7506c0
Pipeline #394 canceled with stage
{-|
Module : Gargantext.Text.Parsers.PubMed
Description : Parser for PubMed
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
This version of Parsers fixes the Date of publication in Gargantext
(V3) parser of PubMed. Indeed, we can not rely neither on Journal
Publication Date neither on Article publication date, which are
incomplete structurally but for its interpretation too. Then, to
simplify and uniformize data, date of publication of database insertion
is used.
TODO:
- Add authors
- factorize
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.PubMed where
import Data.Conduit.List as CL hiding (catMaybes, head)
import Control.Monad (join)
import GHC.IO (FilePath)
import Prelude (read)
import Gargantext.Prelude
import Control.Monad.Catch (MonadThrow)
import Data.Maybe (Maybe)
import Data.Monoid (mconcat)
import Text.XML.Stream.Parse
import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack)
import Data.XML.Types (Event)
import Data.Time.Segment (jour)
import Data.Time (UTCTime(..))
import qualified Data.ByteString.Lazy as DBL
import Gargantext.Text.Parsers.Wikimedia
data PubMed =
PubMed { pubmed_article :: PubMedArticle
, pubmed_date :: PubMedData
} deriving Show
data PubMedArticle =
PubMedArticle { pubmed_title :: Maybe Text
, pubmed_journal :: Maybe Text
, pubmed_abstract :: Maybe [Text]
}
deriving (Show)
data PubMedData =
PubMedData { pubmedData_date :: UTCTime
, pubmedData_year :: Integer
, pubmedData_month :: Int
, pubmedData_day :: Int
} deriving (Show)
readPubMedFile :: FilePath -> IO [PubMed]
readPubMedFile fp = do
input <- DBL.readFile fp
pubMedParser input
pubMedParser :: DBL.ByteString -> IO [PubMed]
pubMedParser bstring = runConduit $ parseLBS def bstring
.| parseArticleSet
.| CL.consume
parseArticleSet :: MonadThrow m => ConduitT Event PubMed m ()
parseArticleSet = do
as <- force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
return as
parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMed)
parsePubMedArticle = do
articles <- tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
return articles
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m (PubMed)
parsePubMedArticle' = do
article <- force "MedlineCitation" $ tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
dates <- tagIgnoreAttrs "PubmedData" $ do
dates' <- tagIgnoreAttrs "History" $ many $ tagIgnoreAttrs "PubMedPubDate" $ do
y' <- force "Year" $ tagIgnoreAttrs "Year" content
m' <- force "Month" $ tagIgnoreAttrs "Month" content
d' <- force "Day" $ tagIgnoreAttrs "Day" content
_ <- many $ ignoreAnyTreeContent
return (read $ unpack y', read $ unpack m', read $ unpack d')
_ <- many $ ignoreAnyTreeContent
return dates'
_ <- many $ ignoreAnyTreeContent
let (y,m,d) = maybe (1,1,1) identity $ join $ fmap head $ reverse <$> join dates
return $ PubMed (article) (PubMedData (jour y m d) y m d)
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = do
a <- force "article" $ manyTagsUntil "Article" parseArticle
_ <- many $ ignoreAnyTreeContent
return a
parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
parseArticle = do
journal <- force "journal" $ manyTagsUntil "Journal" $ do
j <- manyTagsUntil "Title" content
_ <- many $ ignoreAnyTreeContent
return j
title <- do
t <- manyTagsUntil "ArticleTitle" content
return t
abstracts <- do
as <- manyTagsUntil "Abstract" $ many $ do
txt <- tagIgnoreAttrs "AbstractText" $ do
c <- content
_ <- many $ ignoreAnyTreeContent
return c
_ <- many $ ignoreAnyTreeContent
return txt
return as
-- TODO add authos
_ <- many $ ignoreAnyTreeContent
return $ PubMedArticle title journal abstracts
pubMedData :: DBL.ByteString
pubMedData = mconcat
[ "<?xml version=\"1.0\"?>\n"
, "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
, "<PubmedArticleSet>\n"
, "<PubmedArticle>\n"
, " <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
, " <PMID Version=\"1\">30357468</PMID>\n"
, " <DateRevised>\n"
, " <Year>2018</Year>\n"
, " </DateRevised>\n"
, " <Article PubModel=\"Print-Electronic\">\n"
, " <Journal>\n"
, " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
, " <Title>European journal of pediatrics</Title>\n"
, " </Journal>\n"
, " <ArticleTitle>Title of the Article</ArticleTitle>\n"
, " <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
, " <Abstract>\n"
, " <AbstractText>Abstract Text.</AbstractText>\n"
, " </Abstract>\n"
, " <AuthorList>\n"
, " </AuthorList>\n"
, " </Article>\n"
, " </MedlineCitation>\n"
, " <PubmedData>\n"
, " <History>\n"
, " </History>\n"
, " </PubmedData>\n"
, "</PubmedArticle>\n"
, "</PubmedArticleSet>\n"
]
......@@ -6,7 +6,6 @@ packages:
- 'deps/servant-job'
- 'deps/clustering-louvain'
- 'deps/patches-map'
- 'deps/patches-class'
- 'deps/haskell-opaleye'
- 'deps/hsparql'
......@@ -27,7 +26,9 @@ extra-deps:
commit: 4fd2edf30c141600ffad6d730cc4c1c08a6dbce4
- git: https://gitlab.iscpif.fr/gargantext/crawlers/pubmed
commit: dcaa0f5dd53f20648f4f5a615d29163582a4219c
#- opaleye-0.6.7002.0
- git: https://gitlab.iscpif.fr/gargantext/patches-class
commit: 746b4ce0af8f9e600d555ad7e5b2973a940cdad9
#- opaleye-0.6.7002.0
- KMP-0.1.0.2
- accelerate-1.2.0.0
- aeson-lens-0.5.0.0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment