[TEXT][PARSER][XML] Issue.

03ffdda9 · Alexandre Delanoë · 5c8e2fc5 · 03ffdda9 · 03ffdda9
Commit 03ffdda9 authored Nov 12, 2018 by Alexandre Delanoë
Hide whitespace changes
Inline Side-by-side

Showing with 142 additions and 30 deletions

PubMed.hs src/Gargantext/Text/Parsers/PubMed.hs +102 -0

Wikimedia.hs src/Gargantext/Text/Parsers/Wikimedia.hs +40 -30

No files found.
--- a/src/Gargantext/Text/Parsers/PubMed.hs
+++ b/src/Gargantext/Text/Parsers/PubMed.hs
+{-|
+Module      : Gargantext.Text.Parsers.PubMed
+Description : Parser for PubMed XML
+Copyright   : (c) CNRS, 2017-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+@Gargantext.Text.Parsers.PubMed@:
+This module provides a parser for PubMed XML (@PubmedArticleSet@ documents).
+It includes a streaming XML parser that extracts, for each @PubmedArticle@,
+the article title and the title of the journal it was published in.
+-}
+
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE NoImplicitPrelude #-}
+
+module Gargantext.Text.Parsers.PubMed where
+
+{-
+import Data.Conduit
+import Data.XML.Types (Event, Name)
+import Text.Pandoc
+import Data.Text as T
+import Data.Either
+-}
+
+import Control.Monad (join)
+import GHC.IO (FilePath)
+import Prelude (read)
+import Gargantext.Prelude
+import Control.Applicative ((<*))
+import Control.Monad.Catch (MonadThrow)
+import Data.Maybe
+import Data.Monoid (mconcat)
+import Text.XML.Stream.Parse
+import Data.Conduit (runConduit, (.|), ConduitT)
+import Data.Text (Text, unpack)
+import Data.XML.Types (Event)
+import Data.ByteString (ByteString)
+import qualified Data.ByteString.Lazy as DBL
+import Gargantext.Text.Parsers.Wikimedia
+
+
+-- | Result of running 'pubMedParser' on the embedded 'pubMedData' sample.
+-- Handy for reproducing the parser's behaviour from GHCi without any file.
+issueXml :: Maybe [PubMedArticle]
+issueXml = pubMedParser pubMedData
+
+-- | The pieces of a PubMed article extracted by this module.
+-- Both fields are optional.
+data PubMedArticle =
+     PubMedArticle { pubmed_title   :: Maybe Text -- ^ article title, if one was parsed
+                   , pubmed_journal :: Maybe Text -- ^ journal title, if one was parsed
+                   }
+     deriving (Show)
+
+-- | Read the file at the given path as a lazy 'DBL.ByteString' and hand its
+-- contents to 'pubMedParser'.
+readPubMedFile :: FilePath -> IO (Maybe [PubMedArticle])
+readPubMedFile fp = do
+  input <- DBL.readFile fp
+  pure $ pubMedParser input
+
+
+-- | Parse a lazy 'DBL.ByteString' holding a PubMed XML document.
+--
+-- The bytes are turned into XML events with 'parseLBS' (default settings) and
+-- piped into 'parseArticles', which is made mandatory with @force "Pubmed"@.
+-- The conduit is run in the 'Maybe' monad (see the result type).
+-- NOTE(review): presumably any parse failure therefore surfaces as 'Nothing'
+-- through the 'MonadThrow' instance of 'Maybe' — confirm.
+pubMedParser :: DBL.ByteString -> Maybe [PubMedArticle]
+pubMedParser bstring = runConduit $ parseLBS def bstring .| force "Pubmed" parseArticles
+
+-- | Parse the @PubmedArticleSet@ element (its attributes are ignored) and
+-- collect every article that 'parseArticle' yields inside it.
+-- The 'Maybe' is the result of the tag parser for @PubmedArticleSet@.
+parseArticles :: MonadThrow m => ConduitT Event o m (Maybe [PubMedArticle])
+parseArticles = tagIgnoreAttrs "PubmedArticleSet" $ many parseArticle
+
+-- | Parse one @PubmedArticle@ element (its attributes are ignored); the body
+-- of the element is delegated to 'parseMedlineCitation'.
+-- The 'Maybe' is the result of the tag parser for @PubmedArticle@.
+parseArticle :: MonadThrow m => ConduitT Event o m (Maybe PubMedArticle)
+parseArticle = tagIgnoreAttrs "PubmedArticle" parseMedlineCitation
+
+-- | Parse the mandatory @MedlineCitation@ element into a 'PubMedArticle'.
+--
+-- Siblings that precede @Article@ are skipped, then the @Article@ element is
+-- actually entered so that @Journal@ and @ArticleTitle@ are searched among its
+-- children (they are not direct children of @MedlineCitation@):
+--
+-- * the journal name is the text of @Journal/Title@; a @Journal@ element
+--   without a @Title@ is rejected through @force "journal"@;
+-- * the article title is the text of @ArticleTitle@, read directly from that
+--   element rather than from a nested element of the same name.
+--
+-- Whatever remains after the wanted element is drained at each level, because
+-- the enclosing tag parser expects its closing tag to come next.
+-- When there is no @Article@ element, both fields are 'Nothing'.
+parseMedlineCitation :: MonadThrow m => ConduitT Event o m PubMedArticle
+parseMedlineCitation = force "medlineCitation" $ tagIgnoreAttrs "MedlineCitation" $ do
+  article <- manyTagsUntil "Article" $ do
+    journal <- manyTagsUntil "Journal" $ do
+      name <- force "journal" $ manyTagsUntil "Title" content
+      -- Drain what follows Title so the closing Journal tag is next.
+      _ <- many ignoreAnyTreeContent
+      return name
+    title <- manyTagsUntil "ArticleTitle" content
+    -- Drain the rest of Article (abstract, authors, ...).
+    _ <- many ignoreAnyTreeContent
+    return $ PubMedArticle title journal
+  -- Drain the rest of MedlineCitation.
+  _ <- many ignoreAnyTreeContent
+  return $ fromMaybe (PubMedArticle Nothing Nothing) article
+
+
+-- | Minimal PubMed XML sample embedded in the module: a @PubmedArticleSet@
+-- holding a single @PubmedArticle@ with one @MedlineCitation@.
+-- It is the input of 'issueXml'.
+pubMedData :: DBL.ByteString
+pubMedData = mconcat
+  [ "<?xml version=\"1.0\"?>"
+  , "<!DOCTYPE PubmedArticleSet PUBLIC \"-//NLM//DTD PubMedArticle, 1st June 2018//EN\" \"https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_180601.dtd\">"
+  , "<PubmedArticleSet>"
+  , "<PubmedArticle>"
+  , "<MedlineCitation Status=\"Publisher\" Owner=\"NLM\">"
+  , "        <PMID Version=\"1\">30357468</PMID>"
+  , "        <DateRevised>"
+  , "           <Year>2018</Year>"
+  , "        </DateRevised>"
+  , "        <Article PubModel=\"Print-Electronic\">"
+  , "          <Journal>"
+  , "            <ISSN IssnType=\"Electronic\">1432-1076</ISSN>"
+  , "            <Title>European journal of pediatrics</Title>"
+  , "          </Journal>"
+  , "          <ArticleTitle>European journal of pediatrics</ArticleTitle>"
+  , "        </Article>"
+  , "</MedlineCitation>"
+  , "</PubmedArticle>"
+  , "</PubmedArticleSet>"
+  ]
+
+
--- a/src/Gargantext/Text/Parsers/Wikimedia.hs
+++ b/src/Gargantext/Text/Parsers/Wikimedia.hs
@@ -16,15 +16,17 @@ and an wikimedia to plaintext converter for the wikipedia text field
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE NoImplicitPrelude #-}

-module Gargantext.Text.Parsers.Wikimedia where
-import Gargantext.Prelude
-import Text.XML.Stream.Parse
+module Gargantext.Text.Parsers.Wikimedia
+  where
+
 import Control.Monad.Catch
 import Data.Conduit
+import Data.Either
+import Data.Text as T
 import Data.XML.Types (Event, Name)
+import Gargantext.Prelude
 import Text.Pandoc
-import Data.Text as T
-import Data.Either
+import Text.XML.Stream.Parse

 -- | Use case
 -- :{
@@ -38,52 +40,60 @@ import Data.Either
 -- | A simple "Page" type.
 -- For the moment it takes only text and title
 --  (since there is no abstract) will see if other data are relevant.
-data Page = Page
-  {
-    _markupFormat :: MarkupFormat
-  , _title :: Maybe T.Text
-  , _text :: Maybe T.Text
-  }
-  deriving (Show)
+data Page =
+     Page { _markupFormat :: MarkupFormat
+          , _title        :: Maybe T.Text
+          , _text         :: Maybe T.Text
+          }
+          deriving (Show)

 data MarkupFormat = Mediawiki | Plaintext
  deriving (Show)

 parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
-parseRevision =
-  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
-  text <-
-    force "text is missing" $ ignoreExcept
-    "{http://www.mediawiki.org/xml/export-0.10/}text" content
-  many_
-    $ ignoreAnyTreeContent
+parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
+  text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
+  many_ ignoreAnyTreeContent
  return text

-- | Utility function that match everything but the tag given
+-- | Utility function that matches everything but the tag given
 tagUntil :: Name -> NameMatcher Name
 tagUntil name = matching (/= name)

-- | Utility function that parse nothing but the tag given,
+-- | Utility function that consumes everything but the tag given
+-- useful because we have to consume all the data.
+manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
+manyTagsUntil_ = many_ . ignoreTreeContent . tagUntil
+
+manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
+manyTagsUntil_' = many_ . ignoreTag . tagUntil
+
+-- | Utility function that parses nothing but the tag given,
 -- usefull because we have to consume every data.
 ignoreExcept :: MonadThrow m => Name
  -> ConduitT Event o m b
  -> ConduitT Event o m (Maybe b)
 ignoreExcept name f = do
-  _ <- consumeExcept name
-  tagIgnoreAttrs (matching (==name)) f
+  _ <- manyTagsUntil_ name
+  tagIgnoreAttrs (matching (== name)) f
+
+-- TODO: remove ignoreExcept to:
+-- many ignoreAnyTreeContentUntil "Article"
+manyTagsUntil :: MonadThrow m => Name
+  -> ConduitT Event o m b
+  -> ConduitT Event o m (Maybe b)
+manyTagsUntil name f = do
+  _ <- manyTagsUntil_ name
+  tagIgnoreAttrs (matching (== name)) f
+

-- | Utility function that consume everything but the tag given
-- usefull because we have to consume every data.
-consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
-consumeExcept = many_ . ignoreTreeContent . tagUntil

 parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
 parsePage =
  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
  title <-
    tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
-  _ <-
-    consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
+  _ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
  revision <-
    parseRevision
  many_ $ ignoreAnyTreeContent
@@ -109,5 +119,5 @@ mediawikiPageToPlain page = do
                doc <- readMediaWiki def med
                writePlain def doc
              case res of
-                (Left _) -> return Nothing
+                (Left _)  -> return Nothing
                (Right r) -> return $ Just r