Commit c341cb5d authored by Mael NICOLAS's avatar Mael NICOLAS

Parse and convert text and title of wikipedia dump, #4

parent 8f2332b3
......@@ -108,6 +108,7 @@ library:
- mtl
- natural-transformation
- opaleye
- pandoc
- parsec
- path
- path-io
......
......@@ -2,41 +2,67 @@
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.Wikimedia where
import Prelude (print)
import Gargantext.Prelude
import Text.XML.Stream.Parse
import Control.Monad.Catch
import Data.ByteString.Lazy
import Data.Conduit
import Data.XML.Types (Event)
import Data.XML.Types (Event, Name)
import Text.Pandoc
import Data.Text as T
-- | This module provides a parser for Wikipedia dumps.
-- It includes an XML parser for Wikipedia's XML and a wikimedia-to-plaintext converter for the Wikipedia text field.
-- | Use case
-- >>> :{
-- wikimediaFile <- BL.readFile "text.xml"
-- _ <- runConduit $ parseLBS def wikimediaFile .| force "mediawiki required" parseMediawiki .| CL.mapM_ print
-- :}
-- | A simple "Page" type. For the moment it holds only the text and the title (since there is no abstract); we will see if other data are relevant.
data Page = Page
{
_title :: T.Text
, _text :: Maybe T.Text
, _text :: T.Text
}
deriving (Show)
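-- | Quick manual check: parse the local file "text.xml" and print the resulting page.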
runParser :: IO ()
runParser = do
file <- readFile "text.xml"
page <- runConduit $ parseLBS def file .| force "page required" parsePage
print page
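-- | Parse a revision element and return only the content of its text field, ignoring everything else.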
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
text <- force "text is missing" $ tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}text" content
text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
many_ $ ignoreAnyTreeContent
return text
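-- | Match any tag whose name differs from the given one; used to skip the elements we are not interested in.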
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that parses nothing but the given tag, useful because we have to consume all the data.
ignoreExcept :: MonadThrow m => Name -> ConduitT Event o m b -> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
_ <- consumeExcept name
tagIgnoreAttrs (matching (==name)) f
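-- | Consume and discard every subtree until the given tag is reached,
-- e.g. parsePage uses it to skip everything that precedes the revision element.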
consumeExcept :: MonadThrow m => Name -> ConduitT Event o m ()
consumeExcept = many_ . ignoreTreeContent . tagUntil
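-- | Parse a page element: keep its title and the text of its revision, ignore the rest.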
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
title <- force "title is missing" $ tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
revision <- parseRevision
_ <- consumeExcept "{http://www.mediawiki.org/xml/export-0.10/}revision"
revision <- force "revision is missing" $ parseRevision
many_ $ ignoreAnyTreeContent
return $ Page title revision
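-- | Parse the top-level mediawiki element and yield each page it contains downstream.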
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki = tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki" $ manyYield' parsePage
-- | Convert a Page's title and text from wikimedia markup to plain text.
-- The result is wrapped in IO because the Pandoc conversion is run with runIO.
mediawikiPageToPlain :: Page -> IO Page
mediawikiPageToPlain page = do
title <- mediaToPlain $ _title page
revision <- mediaToPlain $ _text page
return $ Page title revision
where mediaToPlain media = do
res <- runIO $ do
doc <- readMediaWiki def media
writePlain def doc
handleError res
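-- | End-to-end sketch, assuming a hypothetical dump file "wikidump.xml" and the
-- qualified imports BL (Data.ByteString.Lazy) and CL (Data.Conduit.List):
-- collect every parsed Page, then convert each one to plain text.
-- >>> :{
-- file  <- BL.readFile "wikidump.xml"
-- pages <- runConduit $ parseLBS def file .| force "mediawiki required" parseMediawiki .| CL.consume
-- plain <- mapM mediawikiPageToPlain pages
-- mapM_ print plain
-- :}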