Commit 5674c9e6 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] Wikidata parser example for artistic movements (to be generalized) WIP

parent f5bb8c77
......@@ -219,6 +219,7 @@ library:
- split
- stemmer
- swagger2
- taggy-lens
- tagsoup
- template-haskell
- temporary
......@@ -238,6 +239,7 @@ library:
- wai-cors
- wai-extra
- warp
- wikiparsec
- wreq
- xml-conduit
- xml-types
......
......@@ -16,11 +16,13 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE TypeFamilies #-}
module Gargantext.Core.Text.Corpus.Parsers.Date {-(parse, parseRaw, dateSplit, Year, Month, Day)-} where
module Gargantext.Core.Text.Corpus.Parsers.Date
{-(parse, parseRaw, dateSplit, Year, Month, Day)-}
where
import Data.Aeson (toJSON, Value)
import Data.HashMap.Strict as HM hiding (map)
import Data.Text (Text, unpack, splitOn, pack)
import Data.Text (Text, unpack, splitOn)
import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
import Data.Time.Clock (UTCTime(..), getCurrentTime)
import Data.Time.LocalTime (utc)
......@@ -69,12 +71,21 @@ parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
type DateFormat = Text
type DateDefault = Text
parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
parseDate' :: DateFormat
-> DateDefault
-> Lang
-> Text
-> IO UTCTime
parseDate' format def lang s = do
dateStr' <- parseRaw lang s
let dateStr = unpack $ maybe def identity
$ head $ splitOn "." dateStr'
pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
if dateStr' == ""
then getCurrentTime
else do
let dateStr = unpack
$ maybe def identity
$ head
$ splitOn "." dateStr'
pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
-- TODO add Paris at Duckling.Locale Region datatype
......@@ -91,24 +102,28 @@ parserLang _ = panic "not implemented"
-- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
-- parseRaw :: Context -> Text -> SomeErrorHandling Text
-- TODO error handling
parseRaw :: Lang -> Text -> IO Text
parseRaw lang text = do -- case result
maybeResult <- extractValue <$> getTimeValue <$> parseDateWithDuckling lang text (Options True)
maybeResult <- extractValue <$> getTimeValue
<$> parseDateWithDuckling lang text (Options True)
case maybeResult of
Just result -> pure result
Nothing -> panic $ "[G.C.T.C.P.D.parseRaw] ERROR" <> (pack . show) lang <> " " <> text
Nothing -> do
printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang)
text
pure ""
getTimeValue :: [ResolvedToken] -> Value
getTimeValue :: [ResolvedToken] -> Maybe Value
getTimeValue rt = case head rt of
Nothing -> panic "error"
Nothing -> do
Nothing
Just x -> case rval x of
RVal Time t -> toJSON t
_ -> panic "error2"
RVal Time t -> Just $ toJSON t
_ -> do
Nothing
extractValue :: Value -> Maybe Text
extractValue (Json.Object object) =
extractValue :: Maybe Value -> Maybe Text
extractValue (Just (Json.Object object)) =
case HM.lookup "value" object of
Just (Json.String date) -> Just date
_ -> Nothing
......
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
Description : To query Wikidata
Copyright : (c) CNRS, 2019-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Gargantext.Core.Text.Corpus.Parsers.Wikidata where
import Control.Lens (makeLenses, (^.) )
import Data.Maybe (catMaybes)
import Data.Text (Text, concat)
import Database.HSparql.Connection
import Gargantext.Core (Lang(..))
import Gargantext.Core.Text.Corpus.Parsers.Isidore (unbound)
import Gargantext.Database.Admin.Types.Hyperdata.Document (HyperdataDocument(..))
import Gargantext.Prelude
import Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
import Prelude (String)
import qualified Data.List as List
import Gargantext.Core.Text.Corpus.Parsers.Date (dateSplit)
data WikiResult = WikiResult { _wr_cid :: Maybe Text
, _wr_title :: Maybe Text
, _wr_url :: Maybe Text
, _wr_yearStart :: Maybe Text
, _wr_yearEnd :: Maybe Text
, _wr_yearFlorish :: Maybe Text
} deriving (Show, Eq)
$(makeLenses ''WikiResult)
type NumberOfSections = Int
wikidataGet :: Int -> NumberOfSections -> IO [HyperdataDocument]
wikidataGet n m = do
results <- wikidataSelect n
mapM (wikiPageToDocument m) results
wikiPageToDocument :: NumberOfSections -> WikiResult -> IO HyperdataDocument
wikiPageToDocument m wr = do
sections <- case wr ^. wr_url of
Nothing -> pure []
Just u -> crawlPage u
let bdd = Just "wikidata"
doi = Nothing
url = (wr ^. wr_url)
uniqId = Nothing
uniqIdBdd = Nothing
page = Nothing
title = (wr ^. wr_title)
authors = Nothing
institutes = Nothing
source = Nothing
abstract = Just $ concat $ take m sections
(date, (year, month, day))
<- dateSplit EN $ head
$ catMaybes
[ wr ^. wr_yearStart
, wr ^. wr_yearEnd
, wr ^. wr_yearFlorish
, head sections
]
let hour = Nothing
minute = Nothing
second = Nothing
iso2 = Just $ cs $ show EN
pure $ HyperdataDocument bdd doi url uniqId uniqIdBdd
page title authors institutes source
abstract ((cs . show) <$> date) year month day hour minute second iso2
wikidataSelect :: Int -> IO [WikiResult]
wikidataSelect n = do
result <- selectQueryRaw wikidataRoute (wikidataQuery n)
case result of
Nothing -> pure []
Just result' -> pure $ map toWikiResult $ unbound' EN result'
unbound' :: Lang -> [[BindingValue]] -> [[Maybe Text]]
unbound' l = map (map (unbound l))
toWikiResult :: [Maybe Text] -> WikiResult
toWikiResult (c:t:u:ys:ye:yf:_) = WikiResult c t u ys ye yf
toWikiResult _ = panic "[G.C.T.C.Parsers.Wikidata.toWikiResult] error"
wikidataRoute :: EndPoint
wikidataRoute = "https://query.wikidata.org/sparql"
wikidataQuery :: Int -> String
wikidataQuery n = List.unlines
[" PREFIX wd: <http://www.wikidata.org/entity/>"
," PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
," PREFIX schema: <http://schema.org/>"
," PREFIX wikibase: <http://wikiba.se/ontology#>"
," SELECT DISTINCT "
," ?cid"
," ?title"
," ?url"
," (year(xsd:dateTime(?dateStart)) as ?yearStart)"
," (year(xsd:dateTime(?dateEnd)) as ?yearEnd)"
," (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) "
," WHERE {"
," ?cid wdt:P31 wd:Q968159 ."
," ?cid rdfs:label ?title filter (lang(?title) = \"en\") ."
," "
," ?url schema:about ?cid ."
," ?url schema:inLanguage \"en\" ."
," FILTER (SUBSTR(str(?url), 1, 25) = \"https://en.wikipedia.org/\")"
," OPTIONAL {?cid (wdt:P580) ?dateStart .}"
," OPTIONAL {?cid (wdt:P582) ?dateEnd .}"
," OPTIONAL {?cid (wdt:P571) ?dateFlorish .}"
," }"
," LIMIT " <> (cs $ show n)
]
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
Description : Some utils to parse dates
Copyright : (c) CNRS 2017-present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Thx to Alp Well Typed for the first version.
-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
where
import Control.Lens hiding (element, elements, children)
import Data.ByteString.Lazy (ByteString)
import Data.Text (Text, unpack)
import Data.Text.Encoding.Error (lenientDecode)
import Data.Text.Lazy.Encoding (decodeUtf8With)
import Gargantext.Prelude
import Network.HTTP.Client (Response)
import Network.Wreq (responseBody, get)
import Text.Taggy.Lens
type WikipediaUrlPage = Text
crawlPage :: WikipediaUrlPage -> IO [Text]
crawlPage url = do
datas <- get (unpack url)
pure $ sectionsOf datas
sectionsOf :: Response ByteString -> [Text]
sectionsOf resp =
resp ^.. responseBody
. to (decodeUtf8With lenientDecode)
. html
. allAttributed (ix "class" . only "mw-parser-output")
. allNamed (only "p")
. to paragraphText
paragraphText :: Element -> Text
paragraphText p = collectTextN (p ^. children)
where collectTextN (NodeContent t : ns) = t <> collectTextN ns
collectTextN (NodeElement elt : ns) = collectTextE elt <> collectTextN ns
collectTextN [] = ""
collectTextE (Element _ _ ns) = collectTextN ns
......@@ -48,7 +48,7 @@ extra-deps:
- git: https://github.com/delanoe/haskell-opaleye.git
commit: d3ab7acd5ede737478763630035aa880f7e34444
- git: https://github.com/delanoe/hsparql.git
commit: 308c74b71a1abb0a91546fa57d353131248e3a7f
commit: 2acbbc55ac9bbd4bf1a713c586b8b8e8b82892eb
- git: https://github.com/robstewart57/rdf4h.git
commit: 4fd2edf30c141600ffad6d730cc4c1c08a6dbce4
......@@ -85,6 +85,8 @@ extra-deps:
- git: https://gitlab.iscpif.fr/anoe/accelerate-utility.git
commit: 83ada76e78ac10d9559af8ed6bd4064ec81308e4
- accelerate-arithmetic-1.0.0.1@sha256:555639232aa5cad411e89247b27871d09352b987a754230a288c690b6de6d888,2096
- git: https://github.com/rspeer/wikiparsec.git
commit: 9637a82344bb70f7fa8f02e75db3c081ccd434ce
# Others dependencies (using stack resolver)
- constraints-extras-0.3.1.0@sha256:12016ebb91ad5ed2c82bf7e48c6bd6947d164d33c9dca5ac3965de1bb6c780c0,1777
......@@ -112,3 +114,9 @@ extra-deps:
# need Vector.uncons
- vector-0.12.3.0@sha256:0ae2c1ba86f0077910be242ec6802cc3d7725fe7b2bea6987201aa3737b239b5,7953
# needed for wikiparsec
- fast-tagsoup-utf8-only-1.0.5@sha256:9292c8ff275c08b88b6013ccc410182552f180904214a07ad4db932ab462aaa1,1651
# wikipedia crawl
- taggy-lens-0.1.2@sha256:091ca81d02bd3d7fb493dce0148e1a38f25eb178a1ebd751043a23239e5e3265,3009
- taggy-0.2.1@sha256:7bc55ddba178971dc6052163597f0445a0a2b5b0ca0e84ce651d53d722e3c265,4662
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment