Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
157
Issues
157
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
5674c9e6
Commit
5674c9e6
authored
Oct 11, 2021
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] Wikidata parser example for artistic movements (to be generalized) WIP
parent
f5bb8c77
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
225 additions
and
17 deletions
+225
-17
package.yaml
package.yaml
+2
-0
Date.hs
src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
+31
-16
Wikidata.hs
src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs
+130
-0
Crawler.hs
src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs
+53
-0
stack.yaml
stack.yaml
+9
-1
No files found.
package.yaml
View file @
5674c9e6
...
...
@@ -219,6 +219,7 @@ library:
-
split
-
stemmer
-
swagger2
-
taggy-lens
-
tagsoup
-
template-haskell
-
temporary
...
...
@@ -238,6 +239,7 @@ library:
-
wai-cors
-
wai-extra
-
warp
-
wikiparsec
-
wreq
-
xml-conduit
-
xml-types
...
...
src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
View file @
5674c9e6
...
...
@@ -16,11 +16,13 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE TypeFamilies #-}
module
Gargantext.Core.Text.Corpus.Parsers.Date
{-(parse, parseRaw, dateSplit, Year, Month, Day)-}
where
module
Gargantext.Core.Text.Corpus.Parsers.Date
{-(parse, parseRaw, dateSplit, Year, Month, Day)-}
where
import
Data.Aeson
(
toJSON
,
Value
)
import
Data.HashMap.Strict
as
HM
hiding
(
map
)
import
Data.Text
(
Text
,
unpack
,
splitOn
,
pack
)
import
Data.Text
(
Text
,
unpack
,
splitOn
)
import
Data.Time
(
parseTimeOrError
,
defaultTimeLocale
,
toGregorian
)
import
Data.Time.Clock
(
UTCTime
(
..
),
getCurrentTime
)
import
Data.Time.LocalTime
(
utc
)
...
...
@@ -69,12 +71,21 @@ parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
type
DateFormat
=
Text
type
DateDefault
=
Text
parseDate'
::
DateFormat
->
DateDefault
->
Lang
->
Text
->
IO
UTCTime
parseDate'
::
DateFormat
->
DateDefault
->
Lang
->
Text
->
IO
UTCTime
parseDate'
format
def
lang
s
=
do
dateStr'
<-
parseRaw
lang
s
let
dateStr
=
unpack
$
maybe
def
identity
$
head
$
splitOn
"."
dateStr'
pure
$
parseTimeOrError
True
defaultTimeLocale
(
unpack
format
)
dateStr
if
dateStr'
==
""
then
getCurrentTime
else
do
let
dateStr
=
unpack
$
maybe
def
identity
$
head
$
splitOn
"."
dateStr'
pure
$
parseTimeOrError
True
defaultTimeLocale
(
unpack
format
)
dateStr
-- TODO add Paris at Duckling.Locale Region datatype
...
...
@@ -91,24 +102,28 @@ parserLang _ = panic "not implemented"
-- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
-- parseRaw :: Context -> Text -> SomeErrorHandling Text
-- TODO error handling
parseRaw
::
Lang
->
Text
->
IO
Text
parseRaw
lang
text
=
do
-- case result
maybeResult
<-
extractValue
<$>
getTimeValue
<$>
parseDateWithDuckling
lang
text
(
Options
True
)
maybeResult
<-
extractValue
<$>
getTimeValue
<$>
parseDateWithDuckling
lang
text
(
Options
True
)
case
maybeResult
of
Just
result
->
pure
result
Nothing
->
panic
$
"[G.C.T.C.P.D.parseRaw] ERROR"
<>
(
pack
.
show
)
lang
<>
" "
<>
text
Nothing
->
do
printDebug
(
"[G.C.T.C.P.D.parseRaw] ERROR "
<>
(
cs
.
show
)
lang
)
text
pure
""
getTimeValue
::
[
ResolvedToken
]
->
Value
getTimeValue
::
[
ResolvedToken
]
->
Maybe
Value
getTimeValue
rt
=
case
head
rt
of
Nothing
->
panic
"error"
Nothing
->
do
Nothing
Just
x
->
case
rval
x
of
RVal
Time
t
->
toJSON
t
_
->
panic
"error2"
RVal
Time
t
->
Just
$
toJSON
t
_
->
do
Nothing
extractValue
::
Value
->
Maybe
Text
extractValue
(
J
son
.
Object
object
)
=
extractValue
::
Maybe
Value
->
Maybe
Text
extractValue
(
J
ust
(
Json
.
Object
object
)
)
=
case
HM
.
lookup
"value"
object
of
Just
(
Json
.
String
date
)
->
Just
date
_
->
Nothing
...
...
src/Gargantext/Core/Text/Corpus/Parsers/Wikidata.hs
0 → 100644
View file @
5674c9e6
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata
Description : To query Wikidata
Copyright : (c) CNRS, 2019-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE ScopedTypeVariables #-}
module
Gargantext.Core.Text.Corpus.Parsers.Wikidata
where
import
Control.Lens
(
makeLenses
,
(
^.
)
)
import
Data.Maybe
(
catMaybes
)
import
Data.Text
(
Text
,
concat
)
import
Database.HSparql.Connection
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers.Isidore
(
unbound
)
import
Gargantext.Database.Admin.Types.Hyperdata.Document
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
import
Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
import
Prelude
(
String
)
import
qualified
Data.List
as
List
import
Gargantext.Core.Text.Corpus.Parsers.Date
(
dateSplit
)
data
WikiResult
=
WikiResult
{
_wr_cid
::
Maybe
Text
,
_wr_title
::
Maybe
Text
,
_wr_url
::
Maybe
Text
,
_wr_yearStart
::
Maybe
Text
,
_wr_yearEnd
::
Maybe
Text
,
_wr_yearFlorish
::
Maybe
Text
}
deriving
(
Show
,
Eq
)
$
(
makeLenses
''
W
ikiResult
)
type
NumberOfSections
=
Int
wikidataGet
::
Int
->
NumberOfSections
->
IO
[
HyperdataDocument
]
wikidataGet
n
m
=
do
results
<-
wikidataSelect
n
mapM
(
wikiPageToDocument
m
)
results
wikiPageToDocument
::
NumberOfSections
->
WikiResult
->
IO
HyperdataDocument
wikiPageToDocument
m
wr
=
do
sections
<-
case
wr
^.
wr_url
of
Nothing
->
pure
[]
Just
u
->
crawlPage
u
let
bdd
=
Just
"wikidata"
doi
=
Nothing
url
=
(
wr
^.
wr_url
)
uniqId
=
Nothing
uniqIdBdd
=
Nothing
page
=
Nothing
title
=
(
wr
^.
wr_title
)
authors
=
Nothing
institutes
=
Nothing
source
=
Nothing
abstract
=
Just
$
concat
$
take
m
sections
(
date
,
(
year
,
month
,
day
))
<-
dateSplit
EN
$
head
$
catMaybes
[
wr
^.
wr_yearStart
,
wr
^.
wr_yearEnd
,
wr
^.
wr_yearFlorish
,
head
sections
]
let
hour
=
Nothing
minute
=
Nothing
second
=
Nothing
iso2
=
Just
$
cs
$
show
EN
pure
$
HyperdataDocument
bdd
doi
url
uniqId
uniqIdBdd
page
title
authors
institutes
source
abstract
((
cs
.
show
)
<$>
date
)
year
month
day
hour
minute
second
iso2
wikidataSelect
::
Int
->
IO
[
WikiResult
]
wikidataSelect
n
=
do
result
<-
selectQueryRaw
wikidataRoute
(
wikidataQuery
n
)
case
result
of
Nothing
->
pure
[]
Just
result'
->
pure
$
map
toWikiResult
$
unbound'
EN
result'
unbound'
::
Lang
->
[[
BindingValue
]]
->
[[
Maybe
Text
]]
unbound'
l
=
map
(
map
(
unbound
l
))
toWikiResult
::
[
Maybe
Text
]
->
WikiResult
toWikiResult
(
c
:
t
:
u
:
ys
:
ye
:
yf
:
_
)
=
WikiResult
c
t
u
ys
ye
yf
toWikiResult
_
=
panic
"[G.C.T.C.Parsers.Wikidata.toWikiResult] error"
wikidataRoute
::
EndPoint
wikidataRoute
=
"https://query.wikidata.org/sparql"
wikidataQuery
::
Int
->
String
wikidataQuery
n
=
List
.
unlines
[
" PREFIX wd: <http://www.wikidata.org/entity/>"
,
" PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
,
" PREFIX schema: <http://schema.org/>"
,
" PREFIX wikibase: <http://wikiba.se/ontology#>"
,
" SELECT DISTINCT "
,
" ?cid"
,
" ?title"
,
" ?url"
,
" (year(xsd:dateTime(?dateStart)) as ?yearStart)"
,
" (year(xsd:dateTime(?dateEnd)) as ?yearEnd)"
,
" (year(xsd:dateTime(?dateFlorish)) as ?yearFlorish) "
,
" WHERE {"
,
" ?cid wdt:P31 wd:Q968159 ."
,
" ?cid rdfs:label ?title filter (lang(?title) =
\"
en
\"
) ."
,
" "
,
" ?url schema:about ?cid ."
,
" ?url schema:inLanguage
\"
en
\"
."
,
" FILTER (SUBSTR(str(?url), 1, 25) =
\"
https://en.wikipedia.org/
\"
)"
,
" OPTIONAL {?cid (wdt:P580) ?dateStart .}"
,
" OPTIONAL {?cid (wdt:P582) ?dateEnd .}"
,
" OPTIONAL {?cid (wdt:P571) ?dateFlorish .}"
,
" }"
,
" LIMIT "
<>
(
cs
$
show
n
)
]
src/Gargantext/Core/Text/Corpus/Parsers/Wikidata/Crawler.hs
0 → 100644
View file @
5674c9e6
{-|
Module : Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
Description : Some utils to parse dates
Copyright : (c) CNRS 2017-present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Thx to Alp Well Typed for the first version.
-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Core.Text.Corpus.Parsers.Wikidata.Crawler
where
import
Control.Lens
hiding
(
element
,
elements
,
children
)
import
Data.ByteString.Lazy
(
ByteString
)
import
Data.Text
(
Text
,
unpack
)
import
Data.Text.Encoding.Error
(
lenientDecode
)
import
Data.Text.Lazy.Encoding
(
decodeUtf8With
)
import
Gargantext.Prelude
import
Network.HTTP.Client
(
Response
)
import
Network.Wreq
(
responseBody
,
get
)
import
Text.Taggy.Lens
type
WikipediaUrlPage
=
Text
crawlPage
::
WikipediaUrlPage
->
IO
[
Text
]
crawlPage
url
=
do
datas
<-
get
(
unpack
url
)
pure
$
sectionsOf
datas
sectionsOf
::
Response
ByteString
->
[
Text
]
sectionsOf
resp
=
resp
^..
responseBody
.
to
(
decodeUtf8With
lenientDecode
)
.
html
.
allAttributed
(
ix
"class"
.
only
"mw-parser-output"
)
.
allNamed
(
only
"p"
)
.
to
paragraphText
paragraphText
::
Element
->
Text
paragraphText
p
=
collectTextN
(
p
^.
children
)
where
collectTextN
(
NodeContent
t
:
ns
)
=
t
<>
collectTextN
ns
collectTextN
(
NodeElement
elt
:
ns
)
=
collectTextE
elt
<>
collectTextN
ns
collectTextN
[]
=
""
collectTextE
(
Element
_
_
ns
)
=
collectTextN
ns
stack.yaml
View file @
5674c9e6
...
...
@@ -48,7 +48,7 @@ extra-deps:
-
git
:
https://github.com/delanoe/haskell-opaleye.git
commit
:
d3ab7acd5ede737478763630035aa880f7e34444
-
git
:
https://github.com/delanoe/hsparql.git
commit
:
308c74b71a1abb0a91546fa57d353131248e3a7f
commit
:
2acbbc55ac9bbd4bf1a713c586b8b8e8b82892eb
-
git
:
https://github.com/robstewart57/rdf4h.git
commit
:
4fd2edf30c141600ffad6d730cc4c1c08a6dbce4
...
...
@@ -85,6 +85,8 @@ extra-deps:
-
git
:
https://gitlab.iscpif.fr/anoe/accelerate-utility.git
commit
:
83ada76e78ac10d9559af8ed6bd4064ec81308e4
-
accelerate-arithmetic-1.0.0.1@sha256:555639232aa5cad411e89247b27871d09352b987a754230a288c690b6de6d888,2096
-
git
:
https://github.com/rspeer/wikiparsec.git
commit
:
9637a82344bb70f7fa8f02e75db3c081ccd434ce
# Others dependencies (using stack resolver)
-
constraints-extras-0.3.1.0@sha256:12016ebb91ad5ed2c82bf7e48c6bd6947d164d33c9dca5ac3965de1bb6c780c0,1777
...
...
@@ -112,3 +114,9 @@ extra-deps:
# need Vector.uncons
-
vector-0.12.3.0@sha256:0ae2c1ba86f0077910be242ec6802cc3d7725fe7b2bea6987201aa3737b239b5,7953
# needed for wikiparsec
-
fast-tagsoup-utf8-only-1.0.5@sha256:9292c8ff275c08b88b6013ccc410182552f180904214a07ad4db932ab462aaa1,1651
# wikipedia crawl
-
taggy-lens-0.1.2@sha256:091ca81d02bd3d7fb493dce0148e1a38f25eb178a1ebd751043a23239e5e3265,3009
-
taggy-0.2.1@sha256:7bc55ddba178971dc6052163597f0445a0a2b5b0ca0e84ce651d53d722e3c265,4662
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment