Commit 681674f6 authored by Alexandre Delanoë

[TEXT][PARSER][PUBMED] PubDate or ArticleDate are not reliable.

parent 03ffdda9
...@@ -29,9 +29,9 @@ import System.FilePath (FilePath(), takeExtension) ...@@ -29,9 +29,9 @@ import System.FilePath (FilePath(), takeExtension)
import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries) import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
import Control.Monad (join) import Control.Monad (join)
import Data.Time (UTCTime(..))
import qualified Data.Time as DT import qualified Data.Time as DT
import Data.Either.Extra (partitionEithers) import Data.Either.Extra (partitionEithers)
import Data.Time (UTCTime(..))
import Data.List (concat) import Data.List (concat)
import qualified Data.Map as DM import qualified Data.Map as DM
import qualified Data.ByteString as DB import qualified Data.ByteString as DB
......
...@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org ...@@ -7,10 +7,7 @@ Maintainer : team@gargantext.org
Stability : experimental Stability : experimental
Portability : POSIX Portability : POSIX
@Gargantext.Text.Parsers.Wikimedia@:
This module provide a parser for wikipedia dump.
This include an xml parser for wikipedia's xml
and an wikimedia to plaintext converter for the wikipedia text field
-} -}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
...@@ -18,85 +15,151 @@ and an wikimedia to plaintext converter for the wikipedia text field ...@@ -18,85 +15,151 @@ and an wikimedia to plaintext converter for the wikipedia text field
module Gargantext.Text.Parsers.PubMed where module Gargantext.Text.Parsers.PubMed where
{-
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either
-}
import Control.Monad (void)
import Data.Conduit.List as CL hiding (catMaybes)
import Control.Monad (join) import Control.Monad (join)
import GHC.IO (FilePath) import GHC.IO (FilePath)
import Prelude (read) import Prelude (read, print)
import Gargantext.Prelude import Gargantext.Prelude
import Control.Applicative ((<*)) import Control.Applicative ((<*))
import Control.Monad.Catch (MonadThrow) import Control.Monad.Catch (MonadThrow)
import Data.Maybe import Data.Maybe (Maybe, catMaybes)
import Data.Monoid (mconcat) import Data.Monoid (mconcat)
import Text.XML.Stream.Parse import Text.XML.Stream.Parse
import Data.Conduit (runConduit, (.|), ConduitT) import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack) import Data.Text (Text, unpack, concat)
import Data.XML.Types (Event) import Data.XML.Types (Event)
import Data.ByteString (ByteString) import Data.ByteString (ByteString)
import Data.Time.Segment (jour)
import Data.Time (UTCTime(..))
import Text.Read (readMaybe)
import qualified Data.ByteString.Lazy as DBL import qualified Data.ByteString.Lazy as DBL
import Gargantext.Text.Parsers.Wikimedia import Gargantext.Text.Parsers.Wikimedia
issueXml :: Maybe [PubMedArticle]
issueXml = pubMedParser pubMedData
data PubMedArticle = data PubMedArticle =
PubMedArticle { pubmed_title :: Maybe Text PubMedArticle { pubmed_title :: Maybe Text
, pubmed_journal :: Maybe Text , pubmed_journal :: Maybe Text
, pubmed_abstract :: Maybe [Text]
, pubmed_date :: UTCTime
, pubmed_year :: Integer
, pubmed_month :: Int
, pubmed_day :: Int
} }
deriving (Show) deriving (Show)
readPubMedFile :: FilePath -> IO (Maybe [PubMedArticle]) readPubMedFile :: FilePath -> IO ()
readPubMedFile fp = do readPubMedFile fp = do
input <- DBL.readFile fp input <- DBL.readFile fp
pure $ pubMedParser input pubMedParser input
pubMedParser :: DBL.ByteString -> Maybe [PubMedArticle] pubMedParser :: DBL.ByteString -> IO ()
pubMedParser bstring = runConduit $ parseLBS def bstring .| force "Pubmed" parseArticles pubMedParser bstring = runConduit $ parseLBS def bstring
.| parseArticleSet
parseArticles :: MonadThrow m => ConduitT Event o m (Maybe [PubMedArticle]) .| CL.mapM_ print
parseArticles = tagIgnoreAttrs "PubmedArticleSet" $ many parseArticle
--parseArticleSet :: MonadThrow m => ConduitT Event o m [PubMedArticle]
parseArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle) parseArticleSet = do
parseArticle = tagIgnoreAttrs "PubmedArticle" parseMedlineCitation as <- force "force" $ tagIgnoreAttrs "PubmedArticleSet" $ manyYield parsePubMedArticle
-- _ <- many $ ignoreAnyTreeContent
return as
-- | Parse one @PubmedArticle@ element, if the next element is one.
--
-- Returns 'Nothing' when the next element is not a @PubmedArticle@, or when
-- the element holds no @MedlineCitation@ (see 'parsePubMedArticle''). The
-- caller drives this parser with @manyYield@, which stops on 'Nothing', so
-- the absence of a further article must be reported as 'Nothing' rather than
-- raised with 'force': forcing here turns the normal end of the article set
-- into an exception.
parsePubMedArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
parsePubMedArticle = do
  article <- tagIgnoreAttrs "PubmedArticle" parsePubMedArticle'
  -- The tag parser wraps the inner 'Maybe' in another 'Maybe'; flatten both.
  return $ join article
-- | Parse the body of a @PubmedArticle@ element.
--
-- Reads the @MedlineCitation@ child through 'parseMedlineCitation' and then
-- skips every remaining sibling tree, so the enclosing tag can be closed.
-- The result is 'Nothing' when the next element is not a @MedlineCitation@.
parsePubMedArticle' :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
parsePubMedArticle' =
  tagIgnoreAttrs "MedlineCitation" parseMedlineCitation
    <* many ignoreAnyTreeContent
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = force "medlineCitation" $ tagIgnoreAttrs "MedlineCitation" $ do parseMedlineCitation = do
_ <- manyTagsUntil_ "Article" a <- force "article" $ manyTagsUntil "Article" parseArticle
journal <- tagIgnoreAttrs "Journal" $ force "journal" $ manyTagsUntil "Title" content
title <- manyTagsUntil "ArticleTitle" $ force "title" $ manyTagsUntil "ArticleTitle" content
_ <- many $ ignoreAnyTreeContent _ <- many $ ignoreAnyTreeContent
return $ PubMedArticle title journal return a
-- | Parse the children of an @Article@ element into a 'PubMedArticle'.
--
-- Reads, in this order: the @Journal@ element (the @PubDate@ inside its
-- @JournalIssue@, then its @Title@), the @ArticleTitle@, the @Abstract@
-- and finally the publication date.
--
-- The date comes from @PubDate@ when its @Year@, @Month@ and @Day@ are all
-- present and numeric. Otherwise the parser falls back to @ArticleDate@,
-- which is then required: a missing @ArticleDate@, @Year@, @Month@ or @Day@
-- is reported through 'force', and a non-numeric field ends in
-- @panic "error date"@.
parseArticle :: MonadThrow m => ConduitT Event o m PubMedArticle
parseArticle = do
  (journal, maybePubDate) <- force "journal" $ manyTagsUntil "Journal" $ do
    maybePubDate' <- manyTagsUntil "JournalIssue" $
      manyTagsUntil "PubDate" $ do
        y <- tagIgnoreAttrs "Year"  content
        m <- tagIgnoreAttrs "Month" content
        d <- tagIgnoreAttrs "Day"   content
        return (y, m, d)
    j <- manyTagsUntil "Title" content
    _ <- many ignoreAnyTreeContent
    return (j, join maybePubDate')

  title <- manyTagsUntil "ArticleTitle" content

  abstracts <- manyTagsUntil "Abstract" $ do
    -- Collect every consecutive AbstractText section. Skipping the remaining
    -- siblings inside the loop would drop all sections after the first one.
    sections <- many $ tagIgnoreAttrs "AbstractText" $ do
      c <- content
      _ <- many ignoreAnyTreeContent
      return c
    -- Skip whatever follows the sections inside Abstract.
    _ <- many ignoreAnyTreeContent
    return sections

  -- TODO add authors

  -- Use the journal PubDate only when it is complete and numeric.
  let pubDate = case maybePubDate of
        Just raw -> readDate raw
        Nothing  -> Nothing

  (year, month, day) <- case pubDate of
    Just ymd -> return ymd
    Nothing  -> do
      raw <- force "ArticleDate" $ manyTagsUntil "ArticleDate" $ do
        y <- force "Year"  $ tagIgnoreAttrs "Year"  content
        m <- force "Month" $ tagIgnoreAttrs "Month" content
        d <- force "Day"   $ tagIgnoreAttrs "Day"   content
        return (Just y, Just m, Just d)
      case readDate raw of
        Just ymd -> return ymd
        Nothing  -> panic "error date"

  _ <- many ignoreAnyTreeContent
  return $ PubMedArticle title journal abstracts (jour year month day) year month day
  where
    -- Total replacement for @read@: 'Nothing' unless all three fields are
    -- present and parse as numbers.
    readDate :: (Maybe Text, Maybe Text, Maybe Text) -> Maybe (Integer, Int, Int)
    readDate (Just y, Just m, Just d) =
      case (readMaybe (unpack y), readMaybe (unpack m), readMaybe (unpack d)) of
        (Just y', Just m', Just d') -> Just (y', m', d')
        _                           -> Nothing
    readDate _ = Nothing
pubMedData :: DBL.ByteString pubMedData :: DBL.ByteString
pubMedData = mconcat pubMedData = mconcat
[ "<?xml version=\"1.0\"?>" [ "<?xml version=\"1.0\"?>\n"
, "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">" , "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">\n"
, "<PubmedArticleSet>" , "<PubmedArticleSet>\n"
, "<PubmedArticle>" , "<PubmedArticle>\n"
, "<MedlineCitation Status=\"Publisher\" Owner=\"NLM\">" , " <MedlineCitation Status=\"Publisher\" Owner=\"NLM\">\n"
, " <PMID Version=\"1\">30357468</PMID>" , " <PMID Version=\"1\">30357468</PMID>\n"
, " <DateRevised>" , " <DateRevised>\n"
, " <Year>2018</Year>" , " <Year>2018</Year>\n"
, " </DateRevised>" , " </DateRevised>\n"
, " <Article PubModel=\"Print-Electronic\">" , " <Article PubModel=\"Print-Electronic\">\n"
, " <Journal>" , " <Journal>\n"
, " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>" , " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>\n"
, " <Title>European journal of pediatrics</Title>" , " <Title>European journal of pediatrics</Title>\n"
, " </Journal>" , " </Journal>\n"
, " <ArticleTitle>European journal of pediatrics</ArticleTitle>" , " <ArticleTitle>Title of the Article</ArticleTitle>\n"
, " </Article>" , " <ELocationID EIdType=\"doi\" ValidYN=\"Y\">10.1007/s00431-018-3270-3</ELocationID>\n"
, "</MedlineCitation>" , " <Abstract>\n"
, "</PubmedArticle>" , " <AbstractText>Abstract Text.</AbstractText>\n"
, "</PubmedArticleSet>" , " </Abstract>\n"
, " <AuthorList>\n"
, " </AuthorList>\n"
, " </Article>\n"
, " </MedlineCitation>\n"
, " <PubmedData>\n"
, " <History>\n"
, " </History>\n"
, " </PubmedData>\n"
, "</PubmedArticle>\n"
, "</PubmedArticleSet>\n"
] ]
...@@ -66,7 +66,7 @@ manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m () ...@@ -66,7 +66,7 @@ manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ = many_ . ignoreTreeContent . tagUntil manyTagsUntil_ = many_ . ignoreTreeContent . tagUntil
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m () manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' = many_ . ignoreTag . tagUntil manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
-- | Utility function that parses nothing but the tag given, -- | Utility function that parses nothing but the tag given,
-- usefull because we have to consume every data. -- usefull because we have to consume every data.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment