Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
147
Issues
147
List
Board
Labels
Milestones
Merge Requests
6
Merge Requests
6
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
9cfea968
Commit
9cfea968
authored
Oct 16, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] parse to hyperdataDocuments (todo: optim).
parent
62c8870d
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
92 additions
and
19 deletions
+92
-19
Tree.hs
src/Gargantext/Database/Tree.hs
+0
-1
Node.hs
src/Gargantext/Database/Types/Node.hs
+4
-4
Parsers.hs
src/Gargantext/Text/Parsers.hs
+65
-2
CSV.hs
src/Gargantext/Text/Parsers/CSV.hs
+1
-1
Date.hs
src/Gargantext/Text/Parsers/Date.hs
+5
-4
WOS.hs
src/Gargantext/Text/Parsers/WOS.hs
+7
-4
Stop.hs
src/Gargantext/Text/Terms/Stop.hs
+10
-3
No files found.
src/Gargantext/Database/Tree.hs
View file @
9cfea968
...
...
@@ -66,7 +66,6 @@ toTree' m n =
TreeN
(
toNodeTree
n
)
$
m
^..
at
(
Just
$
dt_nodeId
n
)
.
_Just
.
each
.
to
(
toTree'
m
)
------------------------------------------------------------------------
toNodeTree
::
DbTreeNode
->
NodeTree
toNodeTree
(
DbTreeNode
nId
tId
_
n
)
=
NodeTree
n
nodeType
nId
...
...
src/Gargantext/Database/Types/Node.hs
View file @
9cfea968
...
...
@@ -102,7 +102,7 @@ $(deriveJSON (unPrefix "hyperdataDocumentV3_") ''HyperdataDocumentV3)
------------------------------------------------------------------------
data
HyperdataDocument
=
HyperdataDocument
{
_hyperdataDocument_bdd
::
Maybe
Text
,
_hyperdataDocument_doi
::
Maybe
In
t
,
_hyperdataDocument_doi
::
Maybe
Tex
t
,
_hyperdataDocument_url
::
Maybe
Text
,
_hyperdataDocument_uniqId
::
Maybe
Text
,
_hyperdataDocument_page
::
Maybe
Int
...
...
@@ -113,11 +113,11 @@ data HyperdataDocument = HyperdataDocument { _hyperdataDocument_bdd
,
_hyperdataDocument_publication_date
::
Maybe
Text
,
_hyperdataDocument_publication_year
::
Maybe
Int
,
_hyperdataDocument_publication_month
::
Maybe
Int
,
_hyperdataDocument_publication_day
::
Maybe
Int
,
_hyperdataDocument_publication_hour
::
Maybe
Int
,
_hyperdataDocument_publication_minute
::
Maybe
Int
,
_hyperdataDocument_publication_second
::
Maybe
Int
,
_hyperdataDocument_language_iso2
::
Maybe
Text
,
_hyperdataDocument_language_iso3
::
Maybe
Text
,
_hyperdataDocument_language_iso2
::
Maybe
Text
}
deriving
(
Show
,
Generic
)
$
(
deriveJSON
(
unPrefix
"_hyperdataDocument_"
)
''
H
yperdataDocument
)
$
(
makeLenses
''
H
yperdataDocument
)
...
...
@@ -128,7 +128,7 @@ instance ToField HyperdataDocument where
toHyperdataDocuments
::
[(
Text
,
Text
)]
->
[
HyperdataDocument
]
toHyperdataDocuments
ts
=
map
(
\
(
t1
,
t2
)
->
HyperdataDocument
Nothing
Nothing
Nothing
Nothing
Nothing
(
Just
t1
)
Nothing
(
Just
t2
)
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
Nothing
)
ts
hyperdataDocuments
::
[
HyperdataDocument
]
...
...
src/Gargantext/Text/Parsers.hs
View file @
9cfea968
...
...
@@ -20,13 +20,17 @@ please follow the types.
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Parsers
(
parse
,
FileFormat
(
..
),
clean
)
module
Gargantext.Text.Parsers
(
parse
,
FileFormat
(
..
),
clean
,
parseDocs
)
where
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Control.Monad
(
join
)
import
Data.Time
(
UTCTime
(
..
))
import
qualified
Data.Time
as
DT
import
Data.Either.Extra
(
partitionEithers
)
import
Data.List
(
concat
)
import
qualified
Data.Map
as
DM
...
...
@@ -44,10 +48,15 @@ import Control.Concurrent.Async as CCA (mapConcurrently)
import
Data.Text.Encoding
(
decodeUtf8
)
import
Data.String
(
String
())
import
Data.List
(
lookup
)
------------------------------------------------------------------------
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Prelude
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Text.Parsers.WOS
(
wosParser
)
import
Gargantext.Text.Parsers.Date
(
parseDate
)
import
Gargantext.Text.Terms.Stop
(
detectLang
)
------------------------------------------------------------------------
type
ParseError
=
String
...
...
@@ -61,7 +70,10 @@ type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
data
FileFormat
=
WOS
-- Implemented (ISI Format)
data
FileFormat
=
WOS
deriving
(
Show
)
-- Implemented (ISI Format)
-- | DOC -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
...
...
@@ -71,6 +83,57 @@ data FileFormat = WOS -- Implemented (ISI Format)
-- TODO: to debug maybe add the filepath in error message
-- | Parse file into documents
-- TODO manage errors here
parseDocs
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseDocs
format
path
=
do
docs
<-
snd
<$>
parse
format
path
mapM
(
toDoc
format
)
docs
type
Year
=
Int
type
Month
=
Int
type
Day
=
Int
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
parseDate'
::
Lang
->
Maybe
Text
->
IO
(
Maybe
UTCTime
,
(
Maybe
Year
,
Maybe
Month
,
Maybe
Day
))
parseDate'
_
Nothing
=
pure
(
Nothing
,
(
Nothing
,
Nothing
,
Nothing
))
parseDate'
l
(
Just
txt
)
=
do
utcTime
<-
parseDate
l
txt
let
(
UTCTime
day
_
)
=
utcTime
let
(
y
,
m
,
d
)
=
DT
.
toGregorian
day
pure
(
Just
utcTime
,
(
Just
(
fromIntegral
y
),
Just
m
,
Just
d
))
toDoc
::
FileFormat
->
[(
Text
,
Text
)]
->
IO
HyperdataDocument
toDoc
format
d
=
do
let
abstract
=
lookup
"abstract"
d
let
lang
=
maybe
EN
identity
(
join
$
detectLang
<$>
(
fmap
(
DT
.
take
50
)
abstract
))
let
dateToParse
=
DT
.
replace
"-"
" "
<$>
lookup
"PY"
d
<>
Just
" "
<>
lookup
"publication_date"
d
(
utcTime
,
(
pub_year
,
pub_month
,
pub_day
))
<-
parseDate'
lang
dateToParse
pure
$
HyperdataDocument
(
Just
$
DT
.
pack
$
show
format
)
(
lookup
"doi"
d
)
(
lookup
"URL"
d
)
Nothing
Nothing
(
lookup
"title"
d
)
(
lookup
"authors"
d
)
(
lookup
"source"
d
)
(
lookup
"abstract"
d
)
(
fmap
(
DT
.
pack
.
show
)
utcTime
)
(
pub_year
)
(
pub_month
)
(
pub_day
)
Nothing
Nothing
Nothing
(
Just
$
(
DT
.
pack
.
show
)
lang
)
parse
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[(
Text
,
Text
)]])
parse
format
path
=
do
files
<-
case
takeExtension
path
of
...
...
src/Gargantext/Text/Parsers/CSV.hs
View file @
9cfea968
...
...
@@ -55,7 +55,7 @@ doc2hyperdataDocument :: Doc -> HyperdataDocument
--doc2hyperdataDocument (Doc did dt ds dpy dpm dpd dab dau) =
doc2hyperdataDocument
(
Doc
did
dt
_
dpy
dpm
dpd
dab
dau
)
=
HyperdataDocument
(
Just
"CSV"
)
(
Just
did
)
(
Just
.
pack
.
show
$
did
)
Nothing
Nothing
Nothing
...
...
src/Gargantext/Text/Parsers/Date.hs
View file @
9cfea968
...
...
@@ -21,7 +21,7 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
module
Gargantext.Text.Parsers.Date
(
parseDate
,
parseDateRaw
)
where
import
Data.HashMap.Strict
as
HM
hiding
(
map
)
import
Data.Text
(
Text
,
unpack
,
splitOn
)
import
Data.Text
(
Text
,
unpack
,
splitOn
,
pack
)
import
Data.Time
(
parseTimeOrError
,
defaultTimeLocale
)
import
Data.Time.Clock
(
UTCTime
,
getCurrentTime
)
import
Data.Time.LocalTime
(
utc
)
...
...
@@ -82,15 +82,16 @@ parserLang EN = DC.EN
-- parseDateRaw :: Context -> Text -> SomeErrorHandling Text
-- TODO error handling
parseDateRaw
::
Lang
->
Text
->
IO
Text
parseDateRaw
::
Lang
->
Text
->
IO
(
Text
)
parseDateRaw
lang
text
=
do
maybeJson
<-
map
jsonValue
<$>
parseDateWithDuckling
lang
text
case
headMay
maybeJson
of
Just
(
Json
.
Object
object
)
->
case
HM
.
lookup
"value"
object
of
Just
(
Json
.
String
date
)
->
pure
date
Just
_
->
panic
"ParseDateRaw ERROR: should be a json String"
Nothing
->
panic
"ParseDateRaw ERROR: no date found"
_
->
panic
"ParseDateRaw ERROR: type error"
Nothing
->
panic
$
"ParseDateRaw ERROR: no date found"
<>
(
pack
.
show
)
lang
<>
" "
<>
text
_
->
panic
$
"ParseDateRaw ERROR: type error"
<>
(
pack
.
show
)
lang
<>
" "
<>
text
-- | Current Time in DucklingTime format
...
...
src/Gargantext/Text/Parsers/WOS.hs
View file @
9cfea968
...
...
@@ -28,10 +28,12 @@ import Data.Attoparsec.ByteString (Parser, try, string
import
Data.Attoparsec.ByteString.Char8
(
anyChar
,
isEndOfLine
)
import
Data.ByteString
(
ByteString
,
concat
)
import
Data.ByteString.Char8
(
pack
)
import
Control.Applicative
--import Gargantext.Types
-------------------------------------------------------------
-- | wosParser parses ISI format from
-- Web Of Science Database
...
...
@@ -48,7 +50,7 @@ notice = start *> fields <* end
where
start
::
Parser
ByteString
start
=
"
\n
PT "
*>
takeTill
isEndOfLine
end
::
Parser
[
Char
]
end
=
manyTill
anyChar
(
string
$
pack
"
\n
ER
\n
"
)
...
...
@@ -75,11 +77,12 @@ lines = many line
translate
::
ByteString
->
ByteString
translate
champs
|
champs
==
"A
U"
=
"author
"
|
champs
==
"A
F"
=
"authors
"
|
champs
==
"TI"
=
"title"
|
champs
==
"SO"
=
"source"
|
champs
==
"DI"
=
"doi"
|
champs
==
"PD"
=
"publication_date"
|
champs
==
"AB"
=
"abstract"
|
otherwise
=
champs
-------------------------------------------------------------
src/Gargantext/Text/Terms/Stop.hs
View file @
9cfea968
...
...
@@ -15,7 +15,7 @@ Main type here is String.
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Text.Terms.Stop
module
Gargantext.Text.Terms.Stop
(
detectLang
,
detectLangs
,
stopList
)
where
import
GHC.Base
(
Functor
)
...
...
@@ -31,6 +31,7 @@ import qualified Data.Map.Strict as DM
import
Data.String
(
String
)
import
Data.Text
(
Text
)
import
Data.Text
(
pack
,
unpack
)
import
Data.Tuple.Extra
(
both
)
...
...
@@ -90,8 +91,14 @@ data LangWord = LangWord Lang Word
type
LangProba
=
Map
Lang
Double
------------------------------------------------------------------------
detectLangs
::
String
->
[(
Lang
,
Double
)]
detectLangs
s
=
DL
.
reverse
$
DL
.
sortOn
snd
detectLang
::
Text
->
Maybe
Lang
detectLang
=
head
.
map
fst
.
detectLangs
detectLangs
::
Text
->
[(
Lang
,
Double
)]
detectLangs
=
detectLangs'
.
unpack
detectLangs'
::
String
->
[(
Lang
,
Double
)]
detectLangs'
s
=
DL
.
reverse
$
DL
.
sortOn
snd
$
toList
$
detect
(
wordsToBook
[
0
..
2
]
s
)
eventLang
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment