Commit 03ffdda9 authored by Alexandre Delanoë

[TEXT][PARSER][XML] Issue.

parent 5c8e2fc5
{-|
Module      : Gargantext.Text.Parsers.PubMed
Description : Parser for PubMed XML
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

@Gargantext.Text.Parsers.PubMed@:
This module provides an XML parser for PubMed article sets,
extracting the article title and the journal title of each record.
-}
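
-- Usage sketch (illustrative only, not checked by the build; the expected
-- output assumes the inline sample document at the bottom of this file):
--
-- >>> issueXml
-- Just [PubMedArticle {pubmed_title = Just "European journal of pediatrics", pubmed_journal = Just "European journal of pediatrics"}]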
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.PubMed where
{-
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either
-}
import Control.Monad (join)
import GHC.IO (FilePath)
import Prelude (read)
import Gargantext.Prelude
import Control.Applicative ((<*))
import Control.Monad.Catch (MonadThrow)
import Data.Maybe
import Data.Monoid (mconcat)
import Text.XML.Stream.Parse
import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack)
import Data.XML.Types (Event)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as DBL
import Gargantext.Text.Parsers.Wikimedia
issueXml :: Maybe [PubMedArticle]
issueXml = pubMedParser pubMedData

data PubMedArticle =
     PubMedArticle { pubmed_title   :: Maybe Text
                   , pubmed_journal :: Maybe Text
                   }
     deriving (Show)

readPubMedFile :: FilePath -> IO (Maybe [PubMedArticle])
readPubMedFile fp = do
  input <- DBL.readFile fp
  pure $ pubMedParser input

pubMedParser :: DBL.ByteString -> Maybe [PubMedArticle]
pubMedParser bstring = runConduit $ parseLBS def bstring .| force "Pubmed" parseArticles

parseArticles :: MonadThrow m => ConduitT Event o m (Maybe [PubMedArticle])
parseArticles = tagIgnoreAttrs "PubmedArticleSet" $ many parseArticle

parseArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
parseArticle = tagIgnoreAttrs "PubmedArticle" parseMedlineCitation

parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = force "medlineCitation" $ tagIgnoreAttrs "MedlineCitation" $ do
  -- Skip PMID, DateRevised, etc. until <Article>, then parse inside it.
  article <- force "article" $ manyTagsUntil "Article" $ do
    journal <- tagIgnoreAttrs "Journal" $ force "journal" $ manyTagsUntil "Title" content
    title   <- manyTagsUntil "ArticleTitle" content
    _       <- many ignoreAnyTreeContent
    return $ PubMedArticle title journal
  _ <- many ignoreAnyTreeContent
  return article

pubMedData :: DBL.ByteString
pubMedData = mconcat
[ "<?xml version=\"1.0\"?>"
, "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">"
, "<PubmedArticleSet>"
, "<PubmedArticle>"
, "<MedlineCitation Status=\"Publisher\" Owner=\"NLM\">"
, " <PMID Version=\"1\">30357468</PMID>"
, " <DateRevised>"
, " <Year>2018</Year>"
, " </DateRevised>"
, " <Article PubModel=\"Print-Electronic\">"
, " <Journal>"
, " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>"
, " <Title>European journal of pediatrics</Title>"
, " </Journal>"
, " <ArticleTitle>European journal of pediatrics</ArticleTitle>"
, " </Article>"
, "</MedlineCitation>"
, "</PubmedArticle>"
, "</PubmedArticleSet>"
]
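
As a quick way to exercise the new parser, a standalone driver along these lines should work once the module above compiles (a sketch only, not part of the commit; the on-disk path is hypothetical):

module Main where

import Gargantext.Text.Parsers.PubMed (issueXml, readPubMedFile)

main :: IO ()
main = do
  -- Parse the inline sample shipped with the module.
  print issueXml
  -- Parse a PubMed XML export from disk (hypothetical path).
  readPubMedFile "data/pubmed-sample.xml" >>= print
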
@@ -16,15 +16,17 @@ and an wikimedia to plaintext converter for the wikipedia text field
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE NoImplicitPrelude #-}
-module Gargantext.Text.Parsers.Wikimedia where
-import Gargantext.Prelude
-import Text.XML.Stream.Parse
+module Gargantext.Text.Parsers.Wikimedia
+  where
 import Control.Monad.Catch
 import Data.Conduit
+import Data.Either
+import Data.Text as T
 import Data.XML.Types (Event, Name)
+import Gargantext.Prelude
 import Text.Pandoc
-import Data.Text as T
-import Data.Either
+import Text.XML.Stream.Parse

 -- | Use case
 -- :{
@@ -38,52 +40,60 @@ import Data.Either
 -- | A simple "Page" type.
 -- For the moment it takes only text and title
 -- (since there is no abstract) will see if other data are relevant.
-data Page = Page
-  {
-    _markupFormat :: MarkupFormat
-  , _title :: Maybe T.Text
-  , _text :: Maybe T.Text
-  }
-  deriving (Show)
+data Page =
+  Page { _markupFormat :: MarkupFormat
+       , _title :: Maybe T.Text
+       , _text :: Maybe T.Text
+       }
+  deriving (Show)

 data MarkupFormat = Mediawiki | Plaintext
   deriving (Show)

 parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
-parseRevision =
-  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
-  text <-
-    force "text is missing" $ ignoreExcept
-    "{http://www.mediawiki.org/xml/export-0.10/}text" content
-  many_
-    $ ignoreAnyTreeContent
+parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
+  text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
+  many_ ignoreAnyTreeContent
   return text

--- | Utility function that match everything but the tag given
+-- | Utility function that matches everything but the tag given
 tagUntil :: Name -> NameMatcher Name
 tagUntil name = matching (/= name)

--- | Utility function that parse nothing but the tag given,
--- usefull because we have to consume every data.
+-- | Utility function that consumes everything but the tag given
+manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
+manyTagsUntil_ = many_ . ignoreTreeContent . tagUntil
+
+manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
+manyTagsUntil_' = many_ . ignoreTag . tagUntil
+
+-- | Utility function that parses nothing but the tag given,
+-- usefull because we have to consume every data.
 ignoreExcept :: MonadThrow m => Name
              -> ConduitT Event o m b
              -> ConduitT Event o m (Maybe b)
 ignoreExcept name f = do
-  _ <- consumeExcept name
-  tagIgnoreAttrs (matching (==name)) f
+  _ <- manyTagsUntil_ name
+  tagIgnoreAttrs (matching (== name)) f

--- | Utility function that consume everything but the tag given
--- usefull because we have to consume every data.
-consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
-consumeExcept = many_ . ignoreTreeContent . tagUntil
+-- TODO: remove ignoreExcept to:
+--   many ignoreAnyTreeContentUntil "Article"
+manyTagsUntil :: MonadThrow m => Name
+              -> ConduitT Event o m b
+              -> ConduitT Event o m (Maybe b)
+manyTagsUntil name f = do
+  _ <- manyTagsUntil_ name
+  tagIgnoreAttrs (matching (== name)) f

 parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
 parsePage =
   tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
   title <-
     tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
-  _ <-
-    consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
+  _ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
   revision <-
     parseRevision
   many_ $ ignoreAnyTreeContent
@@ -109,5 +119,5 @@ mediawikiPageToPlain page = do
       doc <- readMediaWiki def med
       writePlain def doc
   case res of
-    (Left _) -> return Nothing
-    (Right r) -> return $ Just r
+    (Left _)  -> return Nothing
+    (Right r) -> return $ Just r
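
For context on the new combinators: manyTagsUntil_ skips sibling elements until the named tag is reached, and manyTagsUntil then parses that tag's content. A minimal standalone illustration (not part of the commit; the XML snippet and element names are invented):

{-# LANGUAGE OverloadedStrings #-}
module Main where

import Data.Conduit (runConduit, (.|))
import Text.XML.Stream.Parse (content, def, force, parseLBS, tagNoAttr)
import Gargantext.Text.Parsers.Wikimedia (manyTagsUntil)

main :: IO ()
main = do
  let xml = "<doc><skip>a</skip><skip>b</skip><wanted>hello</wanted></doc>"
  -- Skip the <skip> siblings inside <doc>, then read the text of <wanted>.
  r <- runConduit $ parseLBS def xml
                 .| force "doc expected" (tagNoAttr "doc" (manyTagsUntil "wanted" content))
  print r  -- expected: Just "hello"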