From 5674c9e6bfa16b34badea44478d1d5a6c0b7a1f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20Delano=C3=AB?= <devel+git@delanoe.org> Date: Mon, 11 Oct 2021 18:33:29 +0200 Subject: [PATCH] [FEAT] Wikidata parser example for artistic movements (to be generalized) WIP --- package.yaml | 2 + .../Core/Text/Corpus/Parsers/Date.hs | 47 ++++--- .../Core/Text/Corpus/Parsers/Wikidata.hs | 130 ++++++++++++++++++ .../Text/Corpus/Parsers/Wikidata/Crawler.hs | 53 +++++++ stack.yaml | 10 +- 5 files changed, 225 insertions(+), 17 deletions(-) create mode 100644 src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs create mode 100644 src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs diff --git a/package.yaml b/package.yaml index f6fa81ff..aa04d5c3 100644 --- a/package.yaml +++ b/package.yaml @@ -219,6 +219,7 @@ library: - split - stemmer - swagger2 + - taggy-lens - tagsoup - template-haskell - temporary @@ -238,6 +239,7 @@ library: - wai-cors - wai-extra - warp + - wikiparsec - wreq - xml-conduit - xml-types diff --git a/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs b/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs index 3164b0b2..7c4b814b 100644 --- a/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs +++ b/src/Gargantext/Core/Text/Corpus/Parsers/Date.hs @@ -16,11 +16,13 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00" {-# LANGUAGE TypeFamilies #-} -module Gargantext.Core.Text.Corpus.Parsers.Date {-(parse, parseRaw, dateSplit, Year, Month, Day)-} where +module Gargantext.Core.Text.Corpus.Parsers.Date +{-(parse, parseRaw, dateSplit, Year, Month, Day)-} + where import Data.Aeson (toJSON, Value) import Data.HashMap.Strict as HM hiding (map) -import Data.Text (Text, unpack, splitOn, pack) +import Data.Text (Text, unpack, splitOn) import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian) import Data.Time.Clock (UTCTime(..), getCurrentTime) import Data.Time.LocalTime (utc) @@ -69,12 +71,21 @@ parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s type DateFormat = Text type DateDefault = Text -parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime +parseDate' :: DateFormat + -> DateDefault + -> Lang + -> Text + -> IO UTCTime parseDate' format def lang s = do dateStr' <- parseRaw lang s - let dateStr = unpack $ maybe def identity - $ head $ splitOn "." dateStr' - pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr + if dateStr' == "" + then getCurrentTime + else do + let dateStr = unpack + $ maybe def identity + $ head + $ splitOn "." dateStr' + pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr -- TODO add Paris at Duckling.Locale Region datatype @@ -91,24 +102,28 @@ parserLang _ = panic "not implemented" -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime -- parseRaw :: Context -> Text -> SomeErrorHandling Text --- TODO error handling parseRaw :: Lang -> Text -> IO Text parseRaw lang text = do -- case result - maybeResult <- extractValue <$> getTimeValue <$> parseDateWithDuckling lang text (Options True) + maybeResult <- extractValue <$> getTimeValue + <$> parseDateWithDuckling lang text (Options True) case maybeResult of Just result -> pure result - Nothing -> panic $ "[G.C.T.C.P.D.parseRaw] ERROR" <> (pack . show) lang <> " " <> text - + Nothing -> do + printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang) + text + pure "" -getTimeValue :: [ResolvedToken] -> Value +getTimeValue :: [ResolvedToken] -> Maybe Value getTimeValue rt = case head rt of - Nothing -> panic "error" + Nothing -> do + Nothing Just x -> case rval x of - RVal Time t -> toJSON t - _ -> panic "error2" + RVal Time t -> Just $ toJSON t + _ -> do + Nothing -extractValue :: Value -> Maybe Text -extractValue (Json.Object object) = +extractValue :: Maybe Value -> Maybe Text +extractValue (Just (Json.Object object)) = case HM.lookup "value" object of Just (Json.String date) -> Just date _ -> Nothing diff --git a/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs b/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs new file mode 100644 index 00000000..b6da1168 --- /dev/null +++ b/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs @@ -0,0 +1,130 @@ +{-| +Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata +Description : To query Wikidata +Copyright : (c) CNRS, 2019-Present +License : AGPL + CECILL v3 +Maintainer : team@gargantext.org +Stability : experimental +Portability : POSIX + +-} + +{-# LANGUAGE TemplateHaskell #-} +{-# LANGUAGE ScopedTypeVariables #-} + +module Gargantext.Core.Text.Corpus.Parsers.Wikidata where + +import Control.Lens (makeLenses, (^.) ) +import Data.Maybe (catMaybes) +import Data.Text (Text, concat) +import Database.HSparql.Connection +import Gargantext.Core (Lang(..)) +import Gargantext.Core.Text.Corpus.Parsers.Isidore (unbound) +import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..)) +import Gargantext.Prelude +import Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler +import Prelude (String) +import qualified Data.List as List +import Gargantext.Core.Text.Corpus.Parsers.Date (dateSplit) + + + +data WikiResult = WikiResult { _wr_cid :: Maybe Text + , _wr_title :: Maybe Text + , _wr_url :: Maybe Text + , _wr_yearStart :: Maybe Text + , _wr_yearEnd :: Maybe Text + , _wr_yearFlorish :: Maybe Text + } deriving (Show, Eq) +$(makeLenses ''WikiResult) + +type NumberOfSections = Int + +wikidataGet :: Int -> NumberOfSections -> IO [HyperdataDocument] +wikidataGet n m = do + results <- wikidataSelect n + mapM (wikiPageToDocument m) results + + +wikiPageToDocument :: NumberOfSections -> WikiResult -> IO HyperdataDocument +wikiPageToDocument m wr = do + + sections <- case wr ^. wr_url of + Nothing -> pure [] + Just u -> crawlPage u + + let bdd = Just "wikidata" + doi = Nothing + url = (wr ^. wr_url) + uniqId = Nothing + uniqIdBdd = Nothing + page = Nothing + title = (wr ^. wr_title) + authors = Nothing + institutes = Nothing + source = Nothing + abstract = Just $ concat $ take m sections + + (date, (year, month, day)) + <- dateSplit EN $ head + $ catMaybes + [ wr ^. wr_yearStart + , wr ^. wr_yearEnd + , wr ^. wr_yearFlorish + , head sections + ] + + let hour = Nothing + minute = Nothing + second = Nothing + iso2 = Just $ cs $ show EN + + pure $ HyperdataDocument bdd doi url uniqId uniqIdBdd + page title authors institutes source + abstract ((cs . show) <$> date) year month day hour minute second iso2 + + +wikidataSelect :: Int -> IO [WikiResult] +wikidataSelect n = do + result <- selectQueryRaw wikidataRoute (wikidataQuery n) + case result of + Nothing -> pure [] + Just result' -> pure $ map toWikiResult $ unbound' EN result' + + +unbound' :: Lang -> [[BindingValue]] -> [[Maybe Text]] +unbound' l = map (map (unbound l)) + +toWikiResult :: [Maybe Text] -> WikiResult +toWikiResult (c:t:u:ys:ye:yf:_) = WikiResult c t u ys ye yf +toWikiResult _ = panic "[G.C.T.C.Parsers.Wikidata.toWikiResult] error" + +wikidataRoute :: EndPoint +wikidataRoute = "https://query.wikidata.org/sparql" + +wikidataQuery :: Int -> String +wikidataQuery n = List.unlines + [" PREFIX wd: <http://www.wikidata.org/entity/>" + ," PREFIX wdt: <http://www.wikidata.org/prop/direct/>" + ," PREFIX schema: <http://schema.org/>" + ," PREFIX wikibase: <http://wikiba.se/ontology#>" + ," SELECT DISTINCT " + ," ?cid" + ," ?title" + ," ?url" + ," (year(xsd:dateTime(?dateStart)) as ?yearStart)" + ," (year(xsd:dateTime(?dateEnd)) as ?yearEnd)" + ," (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) " + ," WHERE {" + ," ?cid wdt:P31 wd:Q968159 ." + ," ?cid rdfs:label ?title filter (lang(?title) = \"en\") ." + ," " + ," ?url schema:about ?cid ." + ," ?url schema:inLanguage \"en\" ." + ," FILTER (SUBSTR(str(?url), 1, 25) = \"https://en.wikipedia.org/\")" + ," OPTIONAL {?cid (wdt:P580) ?dateStart .}" + ," OPTIONAL {?cid (wdt:P582) ?dateEnd .}" + ," OPTIONAL {?cid (wdt:P571) ?dateFlorish .}" + ," }" + ," LIMIT " <> (cs $ show n) + ] diff --git a/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs b/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs new file mode 100644 index 00000000..3ba4f76b --- /dev/null +++ b/src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs @@ -0,0 +1,53 @@ +{-| +Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler +Description : Some utils to parse dates +Copyright : (c) CNRS 2017-present +License : AGPL + CECILL v3 +Maintainer : team@gargantext.org +Stability : experimental +Portability : POSIX + +Thx to Alp Well Typed for the first version. + +-} + +{-# LANGUAGE OverloadedStrings #-} + +module Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler + where + +import Control.Lens hiding (element, elements, children) +import Data.ByteString.Lazy (ByteString) +import Data.Text (Text, unpack) +import Data.Text.Encoding.Error (lenientDecode) +import Data.Text.Lazy.Encoding (decodeUtf8With) +import Gargantext.Prelude +import Network.HTTP.Client (Response) +import Network.Wreq (responseBody, get) +import Text.Taggy.Lens + + + +type WikipediaUrlPage = Text +crawlPage :: WikipediaUrlPage -> IO [Text] +crawlPage url = do + datas <- get (unpack url) + pure $ sectionsOf datas + + +sectionsOf :: Response ByteString -> [Text] +sectionsOf resp = + resp ^.. responseBody + . to (decodeUtf8With lenientDecode) + . html + . allAttributed (ix "class" . only "mw-parser-output") + . allNamed (only "p") + . to paragraphText + +paragraphText :: Element -> Text +paragraphText p = collectTextN (p ^. children) + where collectTextN (NodeContent t : ns) = t <> collectTextN ns + collectTextN (NodeElement elt : ns) = collectTextE elt <> collectTextN ns + collectTextN [] = "" + + collectTextE (Element _ _ ns) = collectTextN ns diff --git a/stack.yaml b/stack.yaml index 5a82e6ac..6d5746df 100644 --- a/stack.yaml +++ b/stack.yaml @@ -48,7 +48,7 @@ extra-deps: - git: https://github.com/delanoe/haskell-opaleye.git commit: d3ab7acd5ede737478763630035aa880f7e34444 - git: https://github.com/delanoe/hsparql.git - commit: 308c74b71a1abb0a91546fa57d353131248e3a7f + commit: 2acbbc55ac9bbd4bf1a713c586b8b8e8b82892eb - git: https://github.com/robstewart57/rdf4h.git commit: 4fd2edf30c141600ffad6d730cc4c1c08a6dbce4 @@ -85,6 +85,8 @@ extra-deps: - git: https://gitlab.iscpif.fr/anoe/accelerate-utility.git commit: 83ada76e78ac10d9559af8ed6bd4064ec81308e4 - accelerate-arithmetic-1.0.0.1@sha256:555639232aa5cad411e89247b27871d09352b987a754230a288c690b6de6d888,2096 +- git: https://github.com/rspeer/wikiparsec.git + commit: 9637a82344bb70f7fa8f02e75db3c081ccd434ce # Others dependencies (using stack resolver) - constraints-extras-0.3.1.0@sha256:12016ebb91ad5ed2c82bf7e48c6bd6947d164d33c9dca5ac3965de1bb6c780c0,1777 @@ -112,3 +114,9 @@ extra-deps: # need Vector.uncons - vector-0.12.3.0@sha256:0ae2c1ba86f0077910be242ec6802cc3d7725fe7b2bea6987201aa3737b239b5,7953 + +# needed for wikiparsec +- fast-tagsoup-utf8-only-1.0.5@sha256:9292c8ff275c08b88b6013ccc410182552f180904214a07ad4db932ab462aaa1,1651 +# wikipedia crawl +- taggy-lens-0.1.2@sha256:091ca81d02bd3d7fb493dce0148e1a38f25eb178a1ebd751043a23239e5e3265,3009 +- taggy-0.2.1@sha256:7bc55ddba178971dc6052163597f0445a0a2b5b0ca0e84ce651d53d722e3c265,4662 -- 2.21.0