Commit 54aa9ba5 authored by Mael NICOLAS's avatar Mael NICOLAS

formatting so no lines do more than 72 clomuns #4

parent 04133e20
{-|
Module : Gargantext.Text.Parsers.WOS
Description :
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext.Text.Parsers.Wikimedia@:
This module provide a parser for wikipedia dump.
This include an xml parser for wikipedia's xml
and an wikimedia to plaintext converter for the wikipedia text field
-}
{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
...@@ -9,16 +24,20 @@ import Data.Conduit ...@@ -9,16 +24,20 @@ import Data.Conduit
import Data.XML.Types (Event, Name) import Data.XML.Types (Event, Name)
import Text.Pandoc import Text.Pandoc
import Data.Text as T import Data.Text as T
import Data.Either
-- | This module provide a parser for wikipedia dump.
-- This include an xml parser for wikipedia's xml and an wikimedia to plaintext converter for the wikipedia text field
-- | Use case -- | Use case
-- >>> :{ -- >>> :{
-- wikimediaFile <- BL.readFile "text.xml" -- wikimediaFile <- BL.readFile "text.xml"
-- _ <- runConduit $ parseLBS def wikimediaFile .| force "mediawiki required" parseMediawiki .| CL.mapM mediawikiPageToPlain -- _ <- runConduit $ parseLBS def wikimediaFile
-- .| force "mediawiki required" parseMediawiki
-- .| CL.mapM mediawikiPageToPlain
-- .| CL.mapM_ print
-- :} -- :}
-- | A simple "Page" type, for the moment it take only text and title (since there is no abstract) will see if other datas are relevant. -- | A simple "Page" type.
-- For the moment it take only text and title
-- (since there is no abstract) will see if other datas are relevant.
data Page = Page data Page = Page
{ {
_title :: T.Text _title :: T.Text
...@@ -27,16 +46,23 @@ data Page = Page ...@@ -27,16 +46,23 @@ data Page = Page
deriving (Show) deriving (Show)
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text) parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do parseRevision =
text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
many_ $ ignoreAnyTreeContent text <-
force "text is missing" $ ignoreExcept
"{http://www.mediawiki.org/xml/export-0.10/}text" content
many_
$ ignoreAnyTreeContent
return text return text
tagUntil :: Name -> NameMatcher Name tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name) tagUntil name = matching (/= name)
-- | Utility function that parse nothing but the tag given, usefull because we have to consume every data. -- | Utility function that parse nothing but the tag given,
ignoreExcept :: MonadThrow m => Name -> ConduitT Event o m b -> ConduitT Event o m (Maybe b) -- usefull because we have to consume every data.
ignoreExcept :: MonadThrow m => Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
ignoreExcept name f = do ignoreExcept name f = do
_ <- consumeExcept name _ <- consumeExcept name
tagIgnoreAttrs (matching (==name)) f tagIgnoreAttrs (matching (==name)) f
...@@ -45,15 +71,22 @@ consumeExcept :: MonadThrow m => Name -> ConduitT Event o m () ...@@ -45,15 +71,22 @@ consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept = many_ . ignoreTreeContent . tagUntil consumeExcept = many_ . ignoreTreeContent . tagUntil
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page) parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do parsePage =
title <- force "title is missing" $ tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
_ <- consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision" title <-
revision <- force "revision is missing" $ parseRevision force "title is missing" $
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
_ <-
consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
revision <-
force "revision is missing" $ parseRevision
many_ $ ignoreAnyTreeContent many_ $ ignoreAnyTreeContent
return $ Page title revision return $ Page title revision
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ()) parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki = tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki" $ manyYield' parsePage parseMediawiki =
tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
$ manyYield' parsePage
-- | Need to wrap the result in IO to parse and to combine it. -- | Need to wrap the result in IO to parse and to combine it.
mediawikiPageToPlain :: Page -> IO Page mediawikiPageToPlain :: Page -> IO Page
...@@ -65,4 +98,6 @@ mediawikiPageToPlain page = do ...@@ -65,4 +98,6 @@ mediawikiPageToPlain page = do
res <- runIO $ do res <- runIO $ do
doc <- readMediaWiki def media doc <- readMediaWiki def media
writePlain def doc writePlain def doc
handleError res case res of
(Left _) -> return media
(Right r) -> return r
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment