Commit c341cb5d authored by Mael NICOLAS's avatar Mael NICOLAS

Parse and convert text and title of wikipedia dump, #4

parent 8f2332b3
...@@ -108,6 +108,7 @@ library: ...@@ -108,6 +108,7 @@ library:
- mtl - mtl
- natural-transformation - natural-transformation
- opaleye - opaleye
- pandoc
- parsec - parsec
- path - path
- path-io - path-io
......
...@@ -2,41 +2,67 @@ ...@@ -2,41 +2,67 @@
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.Wikimedia where module Gargantext.Text.Parsers.Wikimedia where
import Prelude (print)
import Gargantext.Prelude import Gargantext.Prelude
import Text.XML.Stream.Parse import Text.XML.Stream.Parse
import Control.Monad.Catch import Control.Monad.Catch
import Data.ByteString.Lazy
import Data.Conduit import Data.Conduit
import Data.XML.Types (Event) import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T import Data.Text as T
-- | This module provide a parser for wikipedia dump.
-- This include an xml parser for wikipedia's xml and an wikimedia to plaintext converter for the wikipedia text field
-- | Use case
-- >>> :{
-- wikimediaFile <- BL.readFile "text.xml"
-- _ <- runConduit $ parseLBS def wikimediaFile .| force "mediawiki required" parseMediawiki .| CL.mapM_ print
-- :}
-- | A simple "Page" type, for the moment it take only text and title (since there is no abstract) will see if other datas are relevant.
data Page = Page data Page = Page
{ {
_title :: T.Text _title :: T.Text
, _text :: Maybe T.Text , _text :: T.Text
} }
deriving (Show) deriving (Show)
runParser :: IO ()
runParser = do
file <- readFile "text.xml"
page <- runConduit $ parseLBS def file .| force "page required" parsePage
print page
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text) parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
text <- force "text is missing" $ tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}text" content text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
many_ $ ignoreAnyTreeContent many_ $ ignoreAnyTreeContent
return text return text
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that parse nothing but the tag given, usefull because we have to consume every data.
ignoreExcept :: MonadThrow m => Name -> ConduitT Event o m b -> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
_ <- consumeExcept name
tagIgnoreAttrs (matching (==name)) f
consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept = many_ . ignoreTreeContent . tagUntil
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page) parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do parsePage = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
title <- force "title is missing" $ tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content title <- force "title is missing" $ tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
revision <- parseRevision _ <- consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
revision <- force "revision is missing" $ parseRevision
many_ $ ignoreAnyTreeContent many_ $ ignoreAnyTreeContent
return $ Page title revision return $ Page title revision
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ()) parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki = tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki" $ manyYield' parsePage parseMediawiki = tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki" $ manyYield' parsePage
-- | Need to wrap the result in IO to parse and to combine it.
mediawikiPageToPlain :: Page -> IO Page
mediawikiPageToPlain page = do
title <- mediaToPlain $ _title page
revision <- mediaToPlain $ _text page
return $ Page title revision
where mediaToPlain media = do
res <- runIO $ do
doc <- readMediaWiki def media
writePlain def doc
handleError res
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment