Commit c651ce24 authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge remote-tracking branch 'origin/lang-parser'

parents 9f5a6d1c f7791341
...@@ -52,6 +52,7 @@ library: ...@@ -52,6 +52,7 @@ library:
- Gargantext.Text.Metrics.Count - Gargantext.Text.Metrics.Count
- Gargantext.Text.Parsers.CSV - Gargantext.Text.Parsers.CSV
- Gargantext.Text.Parsers.Date - Gargantext.Text.Parsers.Date
- Gargantext.Text.Parsers.Wikimedia
- Gargantext.Text.Parsers.WOS - Gargantext.Text.Parsers.WOS
- Gargantext.Text.Search - Gargantext.Text.Search
- Gargantext.Text.Terms - Gargantext.Text.Terms
...@@ -111,6 +112,7 @@ library: ...@@ -111,6 +112,7 @@ library:
- mtl - mtl
- natural-transformation - natural-transformation
- opaleye - opaleye
- pandoc
- parsec - parsec
- path - path
- path-io - path-io
...@@ -155,6 +157,8 @@ library: ...@@ -155,6 +157,8 @@ library:
- wai-cors - wai-cors
- wai-extra - wai-extra
- warp - warp
- xml-conduit
- xml-types
- yaml - yaml
- zip - zip
- zlib - zlib
......
{-|
Module : Gargantext.Text.Parsers.Wikimedia
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext.Text.Parsers.Wikimedia@:
This module provides a parser for Wikipedia dumps.
It includes an XML parser for Wikipedia's XML export format
and a MediaWiki-to-plaintext converter for the Wikipedia text field.
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.Wikimedia where
import Gargantext.Prelude
import Text.XML.Stream.Parse
import Control.Monad.Catch
import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either
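-- | Use case (the example assumes qualified imports of
-- "Data.ByteString.Lazy" as @BL@ and "Data.Conduit.List" as @CL@)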
-- >>> :{
-- wikimediaFile <- BL.readFile "text.xml"
-- _ <- runConduit $ parseLBS def wikimediaFile
-- .| force "mediawiki required" parseMediawiki
-- .| CL.mapM mediawikiPageToPlain
-- .| CL.mapM_ print
-- :}
-- | A minimal representation of one wiki page.
--
-- Only the title and the text are kept for now (the dump carries no
-- abstract); further fields may be added if they turn out to be relevant.
data Page = Page
  { _markupFormat :: MarkupFormat  -- ^ Markup the title and text are written in.
  , _title        :: Maybe T.Text  -- ^ Page title, when one was parsed.
  , _text         :: Maybe T.Text  -- ^ Page text, when one was parsed.
  }
  deriving (Show)
-- | Markup in which the textual fields of a 'Page' are expressed.
data MarkupFormat
  = Mediawiki  -- ^ Raw MediaWiki markup, as produced by 'parsePage'.
  | Plaintext  -- ^ Plain text, as produced by 'mediawikiPageToPlain'.
  deriving (Show)
-- | Parse a @revision@ element and return the content of its @text@ child.
--
-- The result is 'Nothing' when the next element is not a @revision@ tag.
-- Inside a revision, everything preceding the @text@ element is skipped, and
-- 'force' makes that element mandatory (failure message: "text is missing").
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision = tagNoAttr revisionTag $ do
  body <- force "text is missing" (ignoreExcept textTag content)
  -- Consume whatever remains inside the revision after the text element.
  many_ ignoreAnyTreeContent
  return body
  where
    revisionTag :: NameMatcher Name
    revisionTag = "{http://www.mediawiki.org/xml/export-0.10/}revision"

    textTag :: Name
    textTag = "{http://www.mediawiki.org/xml/export-0.10/}text"
-- | Build a matcher that accepts every tag name except the given one.
tagUntil :: Name -> NameMatcher Name
tagUntil excluded = matching isOther
  where
    isOther candidate = candidate /= excluded
-- | Skip everything up to the given tag, then parse that tag with @f@.
--
-- Useful because every event of the stream has to be consumed: the elements
-- preceding the wanted tag are discarded with 'consumeExcept', after which
-- the tag itself is parsed with its attributes ignored.
ignoreExcept :: MonadThrow m => Name
             -> ConduitT Event o m b
             -> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
  consumeExcept name
  tagIgnoreAttrs wanted f
  where
    wanted = matching (\candidate -> candidate == name)
-- | Consume and discard everything that is not the given tag.
--
-- Useful because every event of the stream has to be consumed before the
-- tag of interest can be parsed.
consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept name = many_ (ignoreTreeContent (tagUntil name))
-- | Parse a @page@ element into a 'Page' tagged as 'Mediawiki'.
--
-- The title is read from the @title@ child and the text from the
-- @revision@ child (see 'parseRevision'); every other child is skipped.
-- The result is 'Nothing' when the next element is not a @page@ tag.
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage = tagNoAttr pageTag $ do
  pageTitle <- tagNoAttr titleTag content
  -- Discard the elements sitting between the title and the revision.
  consumeExcept revisionTag
  pageText <- parseRevision
  -- Consume whatever remains inside the page after the revision.
  many_ ignoreAnyTreeContent
  return (Page Mediawiki pageTitle pageText)
  where
    pageTag, titleTag :: NameMatcher Name
    pageTag  = "{http://www.mediawiki.org/xml/export-0.10/}page"
    titleTag = "{http://www.mediawiki.org/xml/export-0.10/}title"

    revisionTag :: Name
    revisionTag = "{http://www.mediawiki.org/xml/export-0.10/}revision"
-- | Parse the root @mediawiki@ element (attributes ignored) and yield
-- downstream one 'Page' for every child that 'parsePage' accepts.
--
-- The result is 'Nothing' when the next element is not a @mediawiki@ tag.
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki = tagIgnoreAttrs rootTag (manyYield' parsePage)
  where
    rootTag :: NameMatcher Name
    rootTag = "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
-- | Convert a 'Mediawiki' 'Page' into a 'Plaintext' one.
--
-- The title and the text are each read as MediaWiki markup and written back
-- as plain text through pandoc. The result lives in 'IO' because the pandoc
-- conversion is run with 'runIO'. A field that is absent, or whose
-- conversion fails, comes back as 'Nothing'.
mediawikiPageToPlain :: Page -> IO Page
mediawikiPageToPlain page = do
  plainTitle <- convertField (_title page)
  plainText  <- convertField (_text page)
  return (Page Plaintext plainTitle plainText)
  where
    -- Convert one optional MediaWiki field to plain text.
    convertField Nothing = return Nothing
    convertField (Just markup) = do
      outcome <- runIO $ do
        parsed <- readMediaWiki def markup
        writePlain def parsed
      -- A pandoc error is deliberately collapsed into 'Nothing'.
      return (either (\_ -> Nothing) Just outcome)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment