gargantext / haskell-gargantext

Commit c651ce24, authored Oct 09, 2018 by Alexandre Delanoë

Merge remote-tracking branch 'origin/lang-parser'

Parents: 9f5a6d1c, f7791341
Showing 2 changed files with 117 additions and 0 deletions:

package.yaml                               +4    -0
src/Gargantext/Text/Parsers/Wikimedia.hs   +113  -0
package.yaml

@@ -52,6 +52,7 @@ library:
     - Gargantext.Text.Metrics.Count
     - Gargantext.Text.Parsers.CSV
     - Gargantext.Text.Parsers.Date
+    - Gargantext.Text.Parsers.Wikimedia
     - Gargantext.Text.Parsers.WOS
     - Gargantext.Text.Search
     - Gargantext.Text.Terms
@@ -111,6 +112,7 @@ library:
     - mtl
     - natural-transformation
     - opaleye
+    - pandoc
     - parsec
     - path
     - path-io
@@ -155,6 +157,8 @@ library:
     - wai-cors
     - wai-extra
     - warp
+    - xml-conduit
+    - xml-types
     - yaml
     - zip
     - zlib
src/Gargantext/Text/Parsers/Wikimedia.hs (new file, mode 100644)
{-|
Module      : Gargantext.Text.Parsers.Wikimedia
Description :
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

@Gargantext.Text.Parsers.Wikimedia@:
This module provides a parser for Wikipedia dumps.
It includes an XML parser for Wikipedia's XML export format
and a wikimedia-to-plaintext converter for the Wikipedia text field.
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}

module Gargantext.Text.Parsers.Wikimedia where

import Gargantext.Prelude
import Text.XML.Stream.Parse
import Control.Monad.Catch
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either

-- | Use case
-- >>> :{
--  wikimediaFile <- BL.readFile "text.xml"
--  _ <- runConduit $ parseLBS def wikimediaFile
--        .| force "mediawiki required" parseMediawiki
--        .| CL.mapM mediawikiPageToPlain
--        .| CL.mapM_ print
-- :}

-- | A simple "Page" type.
-- For the moment it carries only the title and the text
-- (since there is no abstract); we will see whether other data are relevant.
data Page = Page
  { _markupFormat :: MarkupFormat
  , _title        :: Maybe T.Text
  , _text         :: Maybe T.Text
  }
  deriving (Show)

data MarkupFormat = Mediawiki | Plaintext
  deriving (Show)
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision =
  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
    text <- force "text is missing" $ ignoreExcept
      "{http://www.mediawiki.org/xml/export-0.10/}text" content
    many_ $ ignoreAnyTreeContent
    return text

-- | Utility function that matches everything but the given tag.
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)

-- | Utility function that parses nothing but the given tag,
-- useful because we have to consume all the data.
ignoreExcept :: MonadThrow m
             => Name
             -> ConduitT Event o m b
             -> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
  _ <- consumeExcept name
  tagIgnoreAttrs (matching (== name)) f

-- | Utility function that consumes everything but the given tag,
-- useful because we have to consume all the data.
consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept = many_ . ignoreTreeContent . tagUntil
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage =
  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
    title <- tagNoAttr
      "{http://www.mediawiki.org/xml/export-0.10/}title" content
    _ <- consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
    revision <- parseRevision
    many_ $ ignoreAnyTreeContent
    return $ Page Mediawiki title revision

parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki =
  tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki" $
    manyYield' parsePage
-- | Convert a Mediawiki Page to a Plaintext Page.
-- We need to wrap the result in IO to parse and combine it.
mediawikiPageToPlain :: Page -> IO Page
mediawikiPageToPlain page = do
  title    <- mediaToPlain $ _title page
  revision <- mediaToPlain $ _text  page
  return $ Page Plaintext title revision
  where
    mediaToPlain media =
      case media of
        (Nothing)  -> return Nothing
        (Just med) -> do
          res <- runIO $ do
            doc <- readMediaWiki def med
            writePlain def doc
          case res of
            (Left _)  -> return Nothing
            (Right r) -> return $ Just r
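As a companion to the use case sketched in the module's doc comment, a minimal self-contained driver against this module might look like the following. This is only a sketch: the Main module, the "dump.xml" file name, and the exact import lists are assumptions, and it relies on the bytestring, conduit, and xml-conduit packages that package.yaml already depends on.

{-# LANGUAGE OverloadedStrings #-}
module Main where

import qualified Data.ByteString.Lazy as BL
import Data.Conduit (runConduit, (.|))
import qualified Data.Conduit.List as CL
import Text.XML.Stream.Parse (def, force, parseLBS)

import Gargantext.Text.Parsers.Wikimedia

main :: IO ()
main = do
  -- "dump.xml" is a placeholder path to a MediaWiki XML export.
  wikimediaFile <- BL.readFile "dump.xml"
  -- Stream XML events, parse each <page> into a Page record,
  -- convert its mediawiki markup to plain text via Pandoc, and print it.
  runConduit $ parseLBS def wikimediaFile
    .| force "mediawiki tag required" parseMediawiki
    .| CL.mapM mediawikiPageToPlain
    .| CL.mapM_ print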