Commit 03ffdda9 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TEXT][PARSER][XML] Issue.

parent 5c8e2fc5
{-|
Module : Gargantext.Text.Parsers.PubMed
Description : Parser for Wikimedia dump
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext.Text.Parsers.Wikimedia@:
This module provide a parser for wikipedia dump.
This include an xml parser for wikipedia's xml
and an wikimedia to plaintext converter for the wikipedia text field
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.PubMed where
{-
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either
-}
import Control.Monad (join)
import GHC.IO (FilePath)
import Prelude (read)
import Gargantext.Prelude
import Control.Applicative ((<*))
import Control.Monad.Catch (MonadThrow)
import Data.Maybe
import Data.Monoid (mconcat)
import Text.XML.Stream.Parse
import Data.Conduit (runConduit, (.|), ConduitT)
import Data.Text (Text, unpack)
import Data.XML.Types (Event)
import Data.ByteString (ByteString)
import qualified Data.ByteString.Lazy as DBL
import Gargantext.Text.Parsers.Wikimedia
issueXml :: Maybe [PubMedArticle]
issueXml = pubMedParser pubMedData
data PubMedArticle =
PubMedArticle { pubmed_title :: Maybe Text
, pubmed_journal :: Maybe Text
}
deriving (Show)
readPubMedFile :: FilePath -> IO (Maybe [PubMedArticle])
readPubMedFile fp = do
input <- DBL.readFile fp
pure $ pubMedParser input
pubMedParser :: DBL.ByteString -> Maybe [PubMedArticle]
pubMedParser bstring = runConduit $ parseLBS def bstring .| force "Pubmed" parseArticles
parseArticles :: MonadThrow m => ConduitT Event o m (Maybe [PubMedArticle])
parseArticles = tagIgnoreAttrs "PubmedArticleSet" $ many parseArticle
parseArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
parseArticle = tagIgnoreAttrs "PubmedArticle" parseMedlineCitation
parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
parseMedlineCitation = force "medlineCitation" $ tagIgnoreAttrs "MedlineCitation" $ do
_ <- manyTagsUntil_ "Article"
journal <- tagIgnoreAttrs "Journal" $ force "journal" $ manyTagsUntil "Title" content
title <- manyTagsUntil "ArticleTitle" $ force "title" $ manyTagsUntil "ArticleTitle" content
_ <- many $ ignoreAnyTreeContent
return $ PubMedArticle title journal
pubMedData :: DBL.ByteString
pubMedData = mconcat
[ "<?xml version=\"1.0\"?>"
, "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">"
, "<PubmedArticleSet>"
, "<PubmedArticle>"
, "<MedlineCitation Status=\"Publisher\" Owner=\"NLM\">"
, " <PMID Version=\"1\">30357468</PMID>"
, " <DateRevised>"
, " <Year>2018</Year>"
, " </DateRevised>"
, " <Article PubModel=\"Print-Electronic\">"
, " <Journal>"
, " <ISSN IssnType=\"Electronic\">1432-1076</ISSN>"
, " <Title>European journal of pediatrics</Title>"
, " </Journal>"
, " <ArticleTitle>European journal of pediatrics</ArticleTitle>"
, " </Article>"
, "</MedlineCitation>"
, "</PubmedArticle>"
, "</PubmedArticleSet>"
]
......@@ -16,15 +16,17 @@ and an wikimedia to plaintext converter for the wikipedia text field
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.Wikimedia where
import Gargantext.Prelude
import Text.XML.Stream.Parse
module Gargantext.Text.Parsers.Wikimedia
where
import Control.Monad.Catch
import Data.Conduit
import Data.Either
import Data.Text as T
import Data.XML.Types (Event, Name)
import Gargantext.Prelude
import Text.Pandoc
import Data.Text as T
import Data.Either
import Text.XML.Stream.Parse
-- | Use case
-- :{
......@@ -38,52 +40,60 @@ import Data.Either
-- | A simple "Page" type.
-- For the moment it takes only text and title
-- (since there is no abstract) will see if other data are relevant.
data Page = Page
{
_markupFormat :: MarkupFormat
, _title :: Maybe T.Text
, _text :: Maybe T.Text
}
deriving (Show)
data Page =
Page { _markupFormat :: MarkupFormat
, _title :: Maybe T.Text
, _text :: Maybe T.Text
}
deriving (Show)
data MarkupFormat = Mediawiki | Plaintext
deriving (Show)
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision =
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
text <-
force "text is missing" $ ignoreExcept
"{http://www.mediawiki.org/xml/export-0.10/}text" content
many_
$ ignoreAnyTreeContent
parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
many_ ignoreAnyTreeContent
return text
-- | Utility function that match everything but the tag given
-- | Utility function that matches everything but the tag given
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that parse nothing but the tag given,
-- | Utility function that consumes everything but the tag given
-- usefull because we have to consume every data.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ = many_ . ignoreTreeContent . tagUntil
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' = many_ . ignoreTag . tagUntil
-- | Utility function that parses nothing but the tag given,
-- usefull because we have to consume every data.
ignoreExcept :: MonadThrow m => Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
_ <- consumeExcept name
tagIgnoreAttrs (matching (==name)) f
_ <- manyTagsUntil_ name
tagIgnoreAttrs (matching (== name)) f
-- TODO: remove ignoreExcept to:
-- many ignoreAnyTreeContentUntil "Article"
manyTagsUntil :: MonadThrow m => Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do
_ <- manyTagsUntil_ name
tagIgnoreAttrs (matching (== name)) f
-- | Utility function that consume everything but the tag given
-- usefull because we have to consume every data.
consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept = many_ . ignoreTreeContent . tagUntil
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage =
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
title <-
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
_ <-
consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
_ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
revision <-
parseRevision
many_ $ ignoreAnyTreeContent
......@@ -109,5 +119,5 @@ mediawikiPageToPlain page = do
doc <- readMediaWiki def med
writePlain def doc
case res of
(Left _) -> return Nothing
(Left _) -> return Nothing
(Right r) -> return $ Just r
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment