From 5674c9e6bfa16b34badea44478d1d5a6c0b7a1f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alexandre=20Delano=C3=AB?= <devel+git@delanoe.org>
Date: Mon, 11 Oct 2021 18:33:29 +0200
Subject: [PATCH] [FEAT] Wikidata parser example for artistic movements (to be
 generalized) WIP

---
 package.yaml                                  |   2 +
 .../Core/Text/Corpus/Parsers/Date.hs          |  47 ++++---
 .../Core/Text/Corpus/Parsers/Wikidata.hs      | 130 ++++++++++++++++++
 .../Text/Corpus/Parsers/Wikidata/Crawler.hs   |  53 +++++++
 stack.yaml                                    |  10 +-
 5 files changed, 225 insertions(+), 17 deletions(-)
 create mode 100644 src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs
 create mode 100644 src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs

diff --git a/package.yaml b/package.yaml
index f6fa81ff..aa04d5c3 100644
--- a/package.yaml
+++ b/package.yaml
@@ -219,6 +219,7 @@ library:
   - split
   - stemmer
   - swagger2
+  - taggy-lens
   - tagsoup
   - template-haskell
   - temporary
@@ -238,6 +239,7 @@ library:
   - wai-cors
   - wai-extra
   - warp
+  - wikiparsec
   - wreq
   - xml-conduit
   - xml-types
diff --git a/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs b/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
index 3164b0b2..7c4b814b 100644
--- a/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
@@ -16,11 +16,13 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
 
 {-# LANGUAGE TypeFamilies #-}
 
-module Gargantext.Core.Text.Corpus.Parsers.Date {-(parse, parseRaw, dateSplit, Year, Month, Day)-} where
+module Gargantext.Core.Text.Corpus.Parsers.Date
+{-(parse, parseRaw, dateSplit, Year, Month, Day)-}
+  where
 
 import Data.Aeson (toJSON, Value)
 import Data.HashMap.Strict as HM hiding (map)
-import Data.Text (Text, unpack, splitOn, pack)
+import Data.Text (Text, unpack, splitOn)
 import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
 import Data.Time.Clock (UTCTime(..), getCurrentTime)
 import Data.Time.LocalTime (utc)
@@ -69,12 +71,21 @@ parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
 type DateFormat  = Text
 type DateDefault = Text
 
-parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
+parseDate' :: DateFormat
+           -> DateDefault
+           -> Lang
+           -> Text
+           -> IO UTCTime
 parseDate' format def lang s = do
   dateStr' <- parseRaw lang s
-  let dateStr = unpack $ maybe def identity
-                       $ head $ splitOn "." dateStr'
-  pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
+  if dateStr' == ""
+    then getCurrentTime
+    else do
+      let dateStr = unpack
+                  $ maybe def identity
+                  $ head
+                  $ splitOn "." dateStr'
+      pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
 
 
 -- TODO add Paris at Duckling.Locale Region datatype
@@ -91,24 +102,28 @@ parserLang _  = panic "not implemented"
 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
 
--- TODO error handling
 parseRaw :: Lang -> Text -> IO Text
 parseRaw lang text = do -- case result
-    maybeResult <- extractValue <$> getTimeValue <$> parseDateWithDuckling lang text (Options True)
+    maybeResult <- extractValue <$> getTimeValue
+                                <$> parseDateWithDuckling lang text (Options True)
     case maybeResult of
       Just result -> pure result
-      Nothing     -> panic $ "[G.C.T.C.P.D.parseRaw] ERROR" <> (pack . show) lang <> " " <> text
-
+      Nothing     -> do
+        printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang)
+                   text
+        pure ""
 
-getTimeValue :: [ResolvedToken] -> Value
+getTimeValue :: [ResolvedToken] -> Maybe Value
 getTimeValue rt = case head rt of
-  Nothing -> panic "error"
+  Nothing -> do
+    Nothing
   Just x  -> case rval x of
-    RVal Time t -> toJSON t
-    _  -> panic "error2"
+    RVal Time t -> Just $ toJSON t
+    _  -> do
+      Nothing
 
-extractValue :: Value -> Maybe Text
-extractValue (Json.Object object) =
+extractValue :: Maybe Value -> Maybe Text
+extractValue (Just (Json.Object object)) =
   case HM.lookup "value" object of
     Just (Json.String date) -> Just date
     _                  -> Nothing
diff --git a/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs b/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs
new file mode 100644
index 00000000..b6da1168
--- /dev/null
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs
@@ -0,0 +1,130 @@
+{-|
+Module      : Gargantext.Core.Text.Corpus.Parsers.Wikidata
+Description : To query Wikidata
+Copyright   : (c) CNRS, 2019-Present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+-}
+
+{-# LANGUAGE TemplateHaskell     #-}
+{-# LANGUAGE ScopedTypeVariables #-}
+
+module Gargantext.Core.Text.Corpus.Parsers.Wikidata where
+
+import Control.Lens (makeLenses, (^.) )
+import Data.Maybe (catMaybes)
+import Data.Text (Text, concat)
+import Database.HSparql.Connection
+import Gargantext.Core (Lang(..))
+import Gargantext.Core.Text.Corpus.Parsers.Isidore (unbound)
+import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
+import Gargantext.Prelude
+import Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
+import Prelude (String)
+import qualified Data.List as List
+import Gargantext.Core.Text.Corpus.Parsers.Date (dateSplit)
+
+
+
+data WikiResult = WikiResult { _wr_cid         :: Maybe Text
+                             , _wr_title       :: Maybe Text
+                             , _wr_url         :: Maybe Text
+                             , _wr_yearStart   :: Maybe Text
+                             , _wr_yearEnd     :: Maybe Text
+                             , _wr_yearFlorish :: Maybe Text
+                             } deriving (Show, Eq)
+$(makeLenses ''WikiResult)
+
+type NumberOfSections = Int
+
+wikidataGet :: Int -> NumberOfSections -> IO [HyperdataDocument]
+wikidataGet n m = do
+  results <- wikidataSelect n
+  mapM (wikiPageToDocument m) results
+
+
+wikiPageToDocument :: NumberOfSections -> WikiResult ->  IO HyperdataDocument
+wikiPageToDocument m wr = do
+
+  sections <- case wr ^. wr_url of
+    Nothing -> pure []
+    Just  u -> crawlPage u
+
+  let bdd    = Just "wikidata"
+      doi    = Nothing
+      url    = (wr ^. wr_url)
+      uniqId    = Nothing
+      uniqIdBdd = Nothing
+      page      = Nothing
+      title     = (wr ^. wr_title)
+      authors    = Nothing
+      institutes = Nothing
+      source     = Nothing
+      abstract   = Just $ concat $ take m sections
+
+  (date, (year, month, day))
+    <- dateSplit EN $ head
+                    $ catMaybes
+                    [ wr ^. wr_yearStart
+                    , wr ^. wr_yearEnd
+                    , wr ^. wr_yearFlorish
+                    , head sections
+                    ]
+
+  let hour = Nothing
+      minute = Nothing
+      second = Nothing
+      iso2   = Just $ cs $ show EN
+
+  pure $ HyperdataDocument bdd doi url uniqId uniqIdBdd
+                           page title authors institutes source
+                           abstract ((cs . show) <$> date) year month day hour minute second iso2
+
+
+wikidataSelect :: Int -> IO [WikiResult]
+wikidataSelect n = do
+  result <- selectQueryRaw wikidataRoute (wikidataQuery n)
+  case result of
+    Nothing      -> pure []
+    Just result' -> pure $ map toWikiResult $ unbound' EN result'
+
+
+unbound' :: Lang -> [[BindingValue]] -> [[Maybe Text]]
+unbound' l = map (map (unbound l))
+
+toWikiResult :: [Maybe Text] -> WikiResult
+toWikiResult (c:t:u:ys:ye:yf:_) = WikiResult c t u ys ye yf
+toWikiResult _                  = panic "[G.C.T.C.Parsers.Wikidata.toWikiResult] error"
+
+wikidataRoute :: EndPoint
+wikidataRoute = "https://query.wikidata.org/sparql"
+
+wikidataQuery :: Int -> String
+wikidataQuery n = List.unlines
+      ["     PREFIX wd:       <http://www.wikidata.org/entity/>"
+      ,"     PREFIX wdt:      <http://www.wikidata.org/prop/direct/>"
+      ,"     PREFIX schema:   <http://schema.org/>"
+      ,"     PREFIX wikibase: <http://wikiba.se/ontology#>"
+      ,"     SELECT DISTINCT "
+      ,"      ?cid"
+      ,"      ?title"
+      ,"      ?url"
+      ,"      (year(xsd:dateTime(?dateStart))   as ?yearStart)"
+      ,"      (year(xsd:dateTime(?dateEnd))     as ?yearEnd)"
+      ,"      (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) "
+      ,"     WHERE {"
+      ,"       ?cid wdt:P31 wd:Q968159 ."
+      ,"       ?cid rdfs:label ?title filter (lang(?title) = \"en\") ."
+      ,"      "
+      ,"       ?url schema:about ?cid ."
+      ,"       ?url schema:inLanguage \"en\" ."
+      ,"       FILTER (SUBSTR(str(?url), 1, 25) = \"https://en.wikipedia.org/\")"
+      ,"       OPTIONAL {?cid (wdt:P580) ?dateStart   .}"
+      ,"       OPTIONAL {?cid (wdt:P582) ?dateEnd     .}"
+      ,"       OPTIONAL {?cid (wdt:P571) ?dateFlorish .}"
+      ,"     }"
+      ,"       LIMIT " <> (cs $ show n)
+      ]
diff --git a/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs b/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs
new file mode 100644
index 00000000..3ba4f76b
--- /dev/null
+++ b/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs
@@ -0,0 +1,53 @@
+{-|
+Module      : Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
+Description : Some utils to parse dates
+Copyright   : (c) CNRS 2017-present
+License     : AGPL + CECILL v3
+Maintainer  : team@gargantext.org
+Stability   : experimental
+Portability : POSIX
+
+Thx to Alp Well Typed for the first version.
+
+-}
+
+{-# LANGUAGE OverloadedStrings #-}
+
+module Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
+  where
+
+import Control.Lens hiding (element, elements, children)
+import Data.ByteString.Lazy (ByteString)
+import Data.Text (Text, unpack)
+import Data.Text.Encoding.Error (lenientDecode)
+import Data.Text.Lazy.Encoding (decodeUtf8With)
+import Gargantext.Prelude
+import Network.HTTP.Client (Response)
+import Network.Wreq (responseBody, get)
+import Text.Taggy.Lens
+
+
+
+type WikipediaUrlPage = Text
+crawlPage :: WikipediaUrlPage -> IO [Text]
+crawlPage url = do
+  datas <- get (unpack url)
+  pure $ sectionsOf datas
+
+
+sectionsOf :: Response ByteString -> [Text]
+sectionsOf resp =
+  resp ^.. responseBody
+         . to (decodeUtf8With lenientDecode)
+         . html
+         . allAttributed (ix "class" . only "mw-parser-output")
+         . allNamed (only "p")
+         . to paragraphText
+
+paragraphText :: Element -> Text
+paragraphText p = collectTextN (p ^. children)
+  where collectTextN (NodeContent t : ns) = t <> collectTextN ns
+        collectTextN (NodeElement elt : ns) = collectTextE elt <> collectTextN ns
+        collectTextN [] = ""
+
+        collectTextE (Element _ _ ns) = collectTextN ns
diff --git a/stack.yaml b/stack.yaml
index 5a82e6ac..6d5746df 100644
--- a/stack.yaml
+++ b/stack.yaml
@@ -48,7 +48,7 @@ extra-deps:
 - git: https://github.com/delanoe/haskell-opaleye.git
   commit: d3ab7acd5ede737478763630035aa880f7e34444
 - git: https://github.com/delanoe/hsparql.git
-  commit: 308c74b71a1abb0a91546fa57d353131248e3a7f
+  commit: 2acbbc55ac9bbd4bf1a713c586b8b8e8b82892eb
 - git: https://github.com/robstewart57/rdf4h.git
   commit: 4fd2edf30c141600ffad6d730cc4c1c08a6dbce4
 
@@ -85,6 +85,8 @@ extra-deps:
 - git: https://gitlab.iscpif.fr/anoe/accelerate-utility.git
   commit: 83ada76e78ac10d9559af8ed6bd4064ec81308e4
 - accelerate-arithmetic-1.0.0.1@sha256:555639232aa5cad411e89247b27871d09352b987a754230a288c690b6de6d888,2096
+- git: https://github.com/rspeer/wikiparsec.git
+  commit: 9637a82344bb70f7fa8f02e75db3c081ccd434ce
 
   # Others dependencies (using stack resolver)
 - constraints-extras-0.3.1.0@sha256:12016ebb91ad5ed2c82bf7e48c6bd6947d164d33c9dca5ac3965de1bb6c780c0,1777
@@ -112,3 +114,9 @@ extra-deps:
 
 # need Vector.uncons
 - vector-0.12.3.0@sha256:0ae2c1ba86f0077910be242ec6802cc3d7725fe7b2bea6987201aa3737b239b5,7953
+
+# needed for wikiparsec
+- fast-tagsoup-utf8-only-1.0.5@sha256:9292c8ff275c08b88b6013ccc410182552f180904214a07ad4db932ab462aaa1,1651
+# wikipedia crawl
+- taggy-lens-0.1.2@sha256:091ca81d02bd3d7fb493dce0148e1a38f25eb178a1ebd751043a23239e5e3265,3009
+- taggy-0.2.1@sha256:7bc55ddba178971dc6052163597f0445a0a2b5b0ca0e84ce651d53d722e3c265,4662
-- 
2.21.0