Commit 54aa9ba5 authored by Mael NICOLAS's avatar Mael NICOLAS

formatting so no lines do more than 72 clomuns #4

parent 04133e20
{-|
Module : Gargantext.Text.Parsers.WOS
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext.Text.Parsers.Wikimedia@:
This module provide a parser for wikipedia dump.
This include an xml parser for wikipedia's xml
and an wikimedia to plaintext converter for the wikipedia text field
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
......@@ -9,16 +24,20 @@ import Data.Conduit
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
import Data.Either
-- | This module provide a parser for wikipedia dump.
-- This include an xml parser for wikipedia's xml and an wikimedia to plaintext converter for the wikipedia text field
-- | Use case
-- >>> :{
-- wikimediaFile <- BL.readFile "text.xml"
-- _ <- runConduit $ parseLBS def wikimediaFile .| force "mediawiki required" parseMediawiki .| CL.mapM mediawikiPageToPlain
-- _ <- runConduit $ parseLBS def wikimediaFile
-- .| force "mediawiki required" parseMediawiki
-- .| CL.mapM mediawikiPageToPlain
-- .| CL.mapM_ print
-- :}
-- | A simple "Page" type, for the moment it take only text and title (since there is no abstract) will see if other datas are relevant.
-- | A simple "Page" type.
-- For the moment it take only text and title
-- (since there is no abstract) will see if other datas are relevant.
data Page = Page
{
_title :: T.Text
......@@ -27,16 +46,23 @@ data Page = Page
deriving (Show)
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
many_ $ ignoreAnyTreeContent
parseRevision =
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
text <-
force "text is missing" $ ignoreExcept
"{http://www.mediawiki.org/xml/export-0.10/}text" content
many_
$ ignoreAnyTreeContent
return text
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that parse nothing but the tag given, usefull because we have to consume every data.
ignoreExcept :: MonadThrow m => Name -> ConduitT Event o m b -> ConduitT Event o m (Maybe b)
-- | Utility function that parse nothing but the tag given,
-- usefull because we have to consume every data.
ignoreExcept :: MonadThrow m => Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
_ <- consumeExcept name
tagIgnoreAttrs (matching (==name)) f
......@@ -45,15 +71,22 @@ consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept = many_ . ignoreTreeContent . tagUntil
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
title <- force "title is missing" $ tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
_ <- consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
revision <- force "revision is missing" $ parseRevision
parsePage =
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
title <-
force "title is missing" $
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
_ <-
consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
revision <-
force "revision is missing" $ parseRevision
many_ $ ignoreAnyTreeContent
return $ Page title revision
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki = tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki" $ manyYield' parsePage
parseMediawiki =
tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
$ manyYield' parsePage
-- | Need to wrap the result in IO to parse and to combine it.
mediawikiPageToPlain :: Page -> IO Page
......@@ -65,4 +98,6 @@ mediawikiPageToPlain page = do
res <- runIO $ do
doc <- readMediaWiki def media
writePlain def doc
handleError res
case res of
(Left _) -> return media
(Right r) -> return r
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment