Commit 8f2332b3 authored by Mael NICOLAS

Need to fix the text parser and apply Pandoc to it; the title parser works.

parent 73bccfaf
......@@ -50,6 +50,7 @@ library:
- Gargantext.Text.Metrics.Count
- Gargantext.Text.Parsers.CSV
- Gargantext.Text.Parsers.Date
- Gargantext.Text.Parsers.Wikimedia
- Gargantext.Text.Parsers.WOS
- Gargantext.Text.Search
- Gargantext.Text.Terms
......@@ -150,6 +151,8 @@ library:
- wai-cors
- wai-extra
- warp
- xml-conduit
- xml-types
- yaml
- zip
- zlib
......
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Parsers.Wikimedia where
import Prelude (print)
import Gargantext.Prelude
import Text.XML.Stream.Parse
import Control.Monad.Catch
import Data.ByteString.Lazy
import Data.Conduit
import Data.XML.Types (Event)
import Data.Text as T
-- | A single wiki page as extracted from a MediaWiki XML export.
data Page = Page
  { _title :: T.Text
    -- ^ Content of the page's @title@ element.
  , _text  :: Maybe T.Text
    -- ^ Content of the revision's @text@ element; 'Nothing' when the
    -- page parser found no revision.
  } deriving Show
-- | Ad-hoc driver: lazily reads @text.xml@ from the current working
-- directory, parses a single page from it and prints the result.
--
-- 'force' makes the conduit throw (\"page required\") when 'parsePage'
-- does not match at the start of the document.
runParser :: IO ()
runParser = do
  xmlBytes <- readFile "text.xml"
  let pageSink = force "page required" parsePage
  parsedPage <- runConduit (parseLBS def xmlBytes .| pageSink)
  print parsedPage
-- | Consume every upcoming sibling element accepted by the given matcher,
-- together with all of its attributes and children.
--
-- Stops, without consuming it, at the first element the matcher rejects
-- (or at the end of the enclosing element). Used to step over the
-- metadata elements that precede the tag we are actually interested in.
skipElements :: MonadThrow m => NameMatcher a -> ConduitT Event o m ()
skipElements unwanted =
  many_ $ tagIgnoreAttrs unwanted (many_ ignoreAnyTreeContent)

-- | Parse a @revision@ element and return the content of its @text@ child.
--
-- Returns 'Nothing' when the next element is not a @revision@. Throws
-- (via 'force', message \"text is missing\") when the revision has no
-- @text@ child.
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision =
  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
    -- In MediaWiki exports @text@ is not the first child of @revision@
    -- (id, timestamp, contributor, ... come first), so step over
    -- everything that is not the @text@ element before requiring it.
    skipElements $ matching (/= textName)
    body <- force "text is missing" $
      tagIgnoreAttrs (matching (== textName)) content
    -- Drain whatever follows (e.g. sha1) so the closing tag is reached.
    many_ ignoreAnyTreeContent
    return body
  where
    textName = "{http://www.mediawiki.org/xml/export-0.10/}text"

-- | Parse a @page@ element into a 'Page'.
--
-- Returns 'Nothing' when the next element is not a @page@. Throws (via
-- 'force', message \"title is missing\") when the page does not start
-- with a @title@ element. '_text' is 'Nothing' when the page contains no
-- @revision@ element; only the first revision is read.
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage =
  tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
    title <- force "title is missing" $
      tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
    -- Elements such as ns and id sit between the title and the revision;
    -- skip them, otherwise the revision is never matched and the text is
    -- silently dropped.
    skipElements $ matching (/= revisionName)
    revision <- parseRevision
    -- Drain any remaining children so the closing tag is reached.
    many_ ignoreAnyTreeContent
    return $ Page title revision
  where
    revisionName = "{http://www.mediawiki.org/xml/export-0.10/}revision"
-- | Stream every page found inside the root @mediawiki@ element.
--
-- Each successfully parsed 'Page' is yielded downstream; children of the
-- root that 'parsePage' does not recognise are skipped by 'manyYield''.
-- Returns 'Nothing' when the next element is not @mediawiki@.
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki = tagIgnoreAttrs rootElement pageStream
  where
    rootElement = "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
    pageStream  = manyYield' parsePage
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment