1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
{-|
Module : Gargantext.Text.Corpus.Parsers.Wikimedia
Description : Parser for Wikimedia dump
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
@Gargantext.Text.Corpus.Parsers.Wikimedia@:
This module provide a parser for wikipedia dump.
This include an xml parser for wikipedia's xml
and an wikimedia to plaintext converter for the wikipedia text field
-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Corpus.Parsers.Wikimedia
where
import Control.Monad.Catch
import Data.Conduit
import Data.Either
import Data.Text as T
import Data.XML.Types (Event, Name)
import Gargantext.Prelude
import Text.Pandoc
import Text.XML.Stream.Parse
-- | Use case
-- :{
-- wikimediaFile <- BL.readFile "text.xml"
-- _ <- runConduit $ parseLBS def wikimediaFile
-- .| force "mediawiki required" parseMediawiki
-- .| CL.mapM mediawikiPageToPlain
-- .| CL.mapM_ print
-- :}
-- | A simple "Page" type.
-- For the moment it takes only text and title
-- (since there is no abstract) will see if other data are relevant.
data Page =
Page { _markupFormat :: MarkupFormat
, _title :: Maybe T.Text
, _text :: Maybe T.Text
}
deriving (Show)
data MarkupFormat = Mediawiki | Plaintext
deriving (Show)
parseRevision :: MonadThrow m => ConduitT Event o m (Maybe T.Text)
parseRevision = tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}revision" $ do
text <- force "text is missing" $ ignoreExcept "{http://www.mediawiki.org/xml/export-0.10/}text" content
many_ ignoreAnyTreeContent
return text
-- | Utility function that matches everything but the tag given
tagUntil :: Name -> NameMatcher Name
tagUntil name = matching (/= name)
-- | Utility function that consumes everything but the tag given
-- usefull because we have to consume every data.
manyTagsUntil_ :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_ = many_ . ignoreTreeContent . tagUntil
manyTagsUntil_' :: MonadThrow m => Name -> ConduitT Event o m ()
manyTagsUntil_' = many_ . ignoreEmptyTag . tagUntil
-- | Utility function that parses nothing but the tag given,
-- usefull because we have to consume every data.
ignoreExcept :: MonadThrow m => Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
ignoreExcept name f = do
_ <- manyTagsUntil_ name
tagIgnoreAttrs (matching (== name)) f
-- TODO: remove ignoreExcept to:
-- many ignoreAnyTreeContentUntil "Article"
manyTagsUntil :: MonadThrow m => Name
-> ConduitT Event o m b
-> ConduitT Event o m (Maybe b)
manyTagsUntil name f = do
_ <- manyTagsUntil_ name
tagIgnoreAttrs (matching (== name)) f
parsePage :: MonadThrow m => ConduitT Event o m (Maybe Page)
parsePage =
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}page" $ do
title <-
tagNoAttr "{http://www.mediawiki.org/xml/export-0.10/}title" content
_ <- manyTagsUntil_ "{http://www.mediawiki.org/xml/export-0.10/}revision"
revision <-
parseRevision
many_ $ ignoreAnyTreeContent
return $ Page Mediawiki title revision
parseMediawiki :: MonadThrow m => ConduitT Event Page m (Maybe ())
parseMediawiki =
tagIgnoreAttrs "{http://www.mediawiki.org/xml/export-0.10/}mediawiki"
$ manyYield' parsePage
-- | Convert a Mediawiki Page to a Plaintext Page.
-- Need to wrap the result in IO to parse and to combine it.
mediawikiPageToPlain :: Page -> IO Page
mediawikiPageToPlain page = do
title <- mediaToPlain $ _title page
revision <- mediaToPlain $ _text page
return $ Page Plaintext title revision
where mediaToPlain media =
case media of
(Nothing) -> return Nothing
(Just med) -> do
res <- runIO $ do
doc <- readMediaWiki def med
writePlain def doc
case res of
(Left _) -> return Nothing
(Right r) -> return $ Just r