Commit c341cb5d authored by Mael NICOLAS's avatar Mael NICOLAS

Parse and convert text and title of wikipedia dump, #4

parent 8f2332b3
......@@ -108,6 +108,7 @@ library:
- mtl
- natural-transformation
- opaleye
- pandoc
- parsec
- path
- path-io
......
......@@ -2,41 +2,67 @@
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.Wikimedia where
import Prelude (print)
import Gargantext.Prelude
import Text.XML.Stream.Parse
import Control.Monad.Catch
import Data.ByteString.Lazy
import Data.Conduit
import Data.XML.Types (Event)
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
-- | This module provides a parser for Wikipedia dumps.
-- It includes an XML parser for Wikipedia's XML and a wikimedia-to-plaintext converter for the Wikipedia text field.
-- | Use case
-- >>> :{
-- wikimediaFile <- BL.readFile "text.xml"
-- _ <- runConduit $ parseLBS def wikimediaFile .| force "mediawiki required" parseMediawiki .| CL.mapM_ print
-- :}
-- | A simple "Page" type. For the moment it holds only the text and the title (since there is no abstract); we will see if other data are relevant.
data Page = Page
{
_title :: T.Text
, _text :: Maybe T.Text
, _text :: T.Text
}
deriving (Show)
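-- | Quick manual check: parse the local file "text.xml" and print the resulting page.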
runParser :: IO ()
runParser = do
file <- readFile "text.xml"
page <- runConduit $ parseLBS def file .| force "page required" parsePage
print page
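-- | Parse a revision element and return only the content of its text field, ignoring everything else.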
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
text <- force "text is missing" $ tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}text" content
text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
many_ $ ignoreAnyTreeContent
return text
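-- | Match any tag whose name differs from the given one; used to skip the elements we are not interested in.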
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that parses nothing but the given tag, useful because we have to consume all the data.
ignoreExcept :: MonadThrow m => Name -> ConduitT Event o m b -> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
_ <- consumeExcept name
tagIgnoreAttrs (matching (==name)) f
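-- | Consume and discard every subtree until the given tag is reached,
-- e.g. parsePage uses it to skip everything that precedes the revision element.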
consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept = many_ . ignoreTreeContent . tagUntil
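-- | Parse a page element: keep its title and the text of its revision, ignore the rest.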
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
title <- force "title is missing" $ tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
revision <- parseRevision
_ <- consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
revision <- force "revision is missing" $ parseRevision
many_ $ ignoreAnyTreeContent
return $ Page title revision
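-- | Parse the top-level mediawiki element and yield each page it contains downstream.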
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki = tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki" $ manyYield' parsePage
-- | Convert a Page's title and text from wikimedia markup to plain text.
-- The result is wrapped in IO because the Pandoc conversion is run with runIO.
mediawikiPageToPlain :: Page -> IO Page
mediawikiPageToPlain page = do
title <- mediaToPlain $ _title page
revision <- mediaToPlain $ _text page
return $ Page title revision
where mediaToPlain media = do
res <- runIO $ do
doc <- readMediaWiki def media
writePlain def doc
handleError res
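-- | End-to-end sketch, assuming a hypothetical dump file "wikidump.xml" and the
-- qualified imports BL (Data.ByteString.Lazy) and CL (Data.Conduit.List):
-- collect every parsed Page, then convert each one to plain text.
-- >>> :{
-- file  <- BL.readFile "wikidump.xml"
-- pages <- runConduit $ parseLBS def file .| force "mediawiki required" parseMediawiki .| CL.consume
-- plain <- mapM mediawikiPageToPlain pages
-- mapM_ print plain
-- :}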