Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
110c6265
Commit
110c6265
authored
May 16, 2019
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev-csv' into dev
parents
a7ed762a
68365f40
Changes
10
Show whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
165 additions
and
114 deletions
+165
-114
CleanCsvCorpus.hs
bin/gargantext-cli/CleanCsvCorpus.hs
+9
-10
package.yaml
package.yaml
+1
-0
Upload.hs
src/Gargantext/API/Upload.hs
+30
-13
Flow.hs
src/Gargantext/Database/Flow.hs
+2
-2
IMT.hs
src/Gargantext/Ext/IMT.hs
+1
-1
Convert.hs
src/Gargantext/Text/Convert.hs
+6
-4
Parsers.hs
src/Gargantext/Text/Parsers.hs
+25
-38
CSV.hs
src/Gargantext/Text/Parsers/CSV.hs
+57
-32
Date.hs
src/Gargantext/Text/Parsers/Date.hs
+32
-12
Json2Csv.hs
src/Gargantext/Text/Parsers/Json2Csv.hs
+2
-2
No files found.
bin/gargantext-cli/CleanCsvCorpus.hs
View file @
110c6265
...
@@ -24,14 +24,13 @@ import qualified Data.Vector as V
...
@@ -24,14 +24,13 @@ import qualified Data.Vector as V
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Text.Search
import
Gargantext.Text.Search
import
Gargantext.Text.Parsers.CSV
import
qualified
Gargantext.Text.Parsers.CSV
as
CSV
------------------------------------------------------------------------
------------------------------------------------------------------------
type
Query
=
[
S
.
Term
]
type
Query
=
[
S
.
Term
]
filterDocs
::
[
DocId
]
->
Vector
Doc
->
Vector
Doc
filterDocs
::
[
DocId
]
->
Vector
CSV
.
Doc
->
Vector
CSV
.
Doc
filterDocs
docIds
=
V
.
filter
(
\
doc
->
S
.
member
(
d_docId
doc
)
$
S
.
fromList
docIds
)
filterDocs
docIds
=
V
.
filter
(
\
doc
->
S
.
member
(
CSV
.
d_docId
doc
)
$
S
.
fromList
docIds
)
main
::
IO
()
main
::
IO
()
...
@@ -41,17 +40,17 @@ main = do
...
@@ -41,17 +40,17 @@ main = do
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let
q
=
[
"gratuit"
,
"gratuité"
,
"culture"
,
"culturel"
]
let
q
=
[
"gratuit"
,
"gratuité"
,
"culture"
,
"culturel"
]
(
h
,
csvDocs
)
<-
readCsv
rPath
(
h
,
csvDocs
)
<-
CSV
.
readFile
rPath
putStrLn
$
"Number of documents before:"
<>
show
(
V
.
length
csvDocs
)
putStrLn
$
"Number of documents before:"
<>
show
(
V
.
length
csvDocs
)
putStrLn
$
"Mean size of docs:"
<>
show
(
docsSize
csvDocs
)
putStrLn
$
"Mean size of docs:"
<>
show
(
CSV
.
docsSize
csvDocs
)
let
docs
=
toDocs
csvDocs
let
docs
=
CSV
.
toDocs
csvDocs
let
engine
=
insertDocs
docs
initialDocSearchEngine
let
engine
=
insertDocs
docs
initialDocSearchEngine
let
docIds
=
S
.
query
engine
(
map
pack
q
)
let
docIds
=
S
.
query
engine
(
map
pack
q
)
let
docs'
=
fromDocs
$
filterDocs
docIds
(
V
.
fromList
docs
)
let
docs'
=
CSV
.
fromDocs
$
filterDocs
docIds
(
V
.
fromList
docs
)
putStrLn
$
"Number of documents after:"
<>
show
(
V
.
length
docs'
)
putStrLn
$
"Number of documents after:"
<>
show
(
V
.
length
docs'
)
putStrLn
$
"Mean size of docs:"
<>
show
(
docsSize
docs'
)
putStrLn
$
"Mean size of docs:"
<>
show
(
CSV
.
docsSize
docs'
)
writeCsv
wPath
(
h
,
docs'
)
CSV
.
writeFile
wPath
(
h
,
docs'
)
package.yaml
View file @
110c6265
...
@@ -162,6 +162,7 @@ library:
...
@@ -162,6 +162,7 @@ library:
-
servant-swagger
-
servant-swagger
-
servant-swagger-ui
-
servant-swagger-ui
-
servant-static-th
-
servant-static-th
-
servant-cassava
-
serialise
-
serialise
-
split
-
split
-
stemmer
-
stemmer
...
...
src/Gargantext/API/Upload.hs
View file @
110c6265
...
@@ -25,17 +25,20 @@ Portability : POSIX
...
@@ -25,17 +25,20 @@ Portability : POSIX
module
Gargantext.API.Upload
module
Gargantext.API.Upload
where
where
import
qualified
Data.Text
as
Text
import
GHC.Generics
(
Generic
)
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
import
Data.Aeson
import
Servant
import
Servant
import
Servant.Multipart
import
Servant.Multipart
--import Servant.Mock (HasMock(mock))
--import Servant.Mock (HasMock(mock))
import
Servant.Swagger
(
HasSwagger
(
toSwagger
))
import
Servant.Swagger
(
HasSwagger
(
toSwagger
))
import
qualified
Data.ByteString.Lazy
as
LBS
--
import qualified Data.ByteString.Lazy as LBS
import
Control.Monad
import
Control.Monad
import
Control.Monad.IO.Class
import
Control.Monad.IO.Class
import
Gargantext.API.Types
import
Gargantext.API.Types
--import Servant.CSV.Cassava (CSV'(..))
--import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
--import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
--import Data.Swagger
--import Data.Swagger
--import Gargantext.API.Ngrams (TODO)
--import Gargantext.API.Ngrams (TODO)
...
@@ -67,22 +70,36 @@ instance HasMock (MultipartForm Mem (MultipartData Mem) :> sub) context where
...
@@ -67,22 +70,36 @@ instance HasMock (MultipartForm Mem (MultipartData Mem) :> sub) context where
mock _ _ = undefined
mock _ _ = undefined
-}
-}
type
ApiUpload
=
MultipartForm
Mem
(
MultipartData
Mem
)
:>
Post
'[
J
SON
]
Integer
data
Upload
=
Upload
{
up
::
[
Text
]
}
deriving
(
Generic
)
instance
ToJSON
Upload
type
ApiUpload
=
MultipartForm
Mem
(
MultipartData
Mem
)
:>
Post
'[
J
SON
]
Text
-- MultipartData consists in textual inputs,
-- MultipartData consists in textual inputs,
-- accessible through its "inputs" field, as well
-- accessible through its "inputs" field, as well
-- as files, accessible through its "files" field.
-- as files, accessible through its "files" field.
upload
::
GargServer
ApiUpload
upload
::
GargServer
ApiUpload
upload
multipartData
=
do
upload
multipartData
=
do
liftIO
$
do
--{-
is
<-
liftIO
$
do
putStrLn
(
"Inputs:"
::
Text
)
putStrLn
(
"Inputs:"
::
Text
)
forM_
(
inputs
multipartData
)
$
\
input
->
forM
(
inputs
multipartData
)
$
\
input
->
do
putStrLn
$
(
" "
::
Text
)
<>
(
iName
input
)
putStrLn
$
(
"iName "
::
Text
)
<>
(
iName
input
)
<>
(
" -> "
::
Text
)
<>
(
iValue
input
)
<>
(
"iValue "
::
Text
)
<>
(
iValue
input
)
pure
$
iName
input
forM_
(
files
multipartData
)
$
\
file
->
do
--{-
_
<-
forM
(
files
multipartData
)
$
\
file
->
do
let
content
=
fdPayload
file
let
content
=
fdPayload
file
putStrLn
$
(
"Content of "
::
Text
)
<>
(
fdFileName
file
)
putStrLn
$
(
"XXX "
::
Text
)
<>
(
fdFileName
file
)
LBS
.
putStr
content
putStrLn
$
(
"YYY "
::
Text
)
<>
cs
content
return
0
--pure $ cs content
-- is <- inputs multipartData
--}
pure
$
Text
.
concat
$
map
cs
is
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
src/Gargantext/Database/Flow.hs
View file @
110c6265
...
@@ -59,7 +59,7 @@ import Gargantext.Ext.IMT (toSchoolName)
...
@@ -59,7 +59,7 @@ import Gargantext.Ext.IMT (toSchoolName)
import
Gargantext.Ext.IMTUser
(
deserialiseImtUsersFromFile
)
import
Gargantext.Ext.IMTUser
(
deserialiseImtUsersFromFile
)
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Text.List
(
buildNgramsLists
,
StopSize
(
..
))
import
Gargantext.Text.List
(
buildNgramsLists
,
StopSize
(
..
))
import
Gargantext.Text.Parsers
(
parse
Docs
,
FileFormat
)
import
Gargantext.Text.Parsers
(
parse
File
,
FileFormat
)
import
Gargantext.Text.Terms
(
TermType
(
..
),
tt_lang
)
import
Gargantext.Text.Terms
(
TermType
(
..
),
tt_lang
)
import
Gargantext.Text.Terms
(
extractTerms
)
import
Gargantext.Text.Terms
(
extractTerms
)
import
Gargantext.Text.Terms.Mono.Stem.En
(
stemIt
)
import
Gargantext.Text.Terms.Mono.Stem.En
(
stemIt
)
...
@@ -114,7 +114,7 @@ flowCorpusFile :: FlowCmdM env ServantErr m
...
@@ -114,7 +114,7 @@ flowCorpusFile :: FlowCmdM env ServantErr m
flowCorpusFile
u
n
l
la
ff
fp
=
do
flowCorpusFile
u
n
l
la
ff
fp
=
do
docs
<-
liftIO
(
splitEvery
500
docs
<-
liftIO
(
splitEvery
500
<$>
take
l
<$>
take
l
<$>
parse
Docs
ff
fp
<$>
parse
File
ff
fp
)
)
flowCorpus
u
n
la
(
map
(
map
toHyperdataDocument
)
docs
)
flowCorpus
u
n
la
(
map
(
map
toHyperdataDocument
)
docs
)
...
...
src/Gargantext/Ext/IMT.hs
View file @
110c6265
...
@@ -101,7 +101,7 @@ mapIdSchool :: Map Text Text
...
@@ -101,7 +101,7 @@ mapIdSchool :: Map Text Text
mapIdSchool
=
M
.
fromList
$
Gargantext
.
Prelude
.
map
(
\
(
School
n
_
i
)
->
(
i
,
n
))
schools
mapIdSchool
=
M
.
fromList
$
Gargantext
.
Prelude
.
map
(
\
(
School
n
_
i
)
->
(
i
,
n
))
schools
hal_data
::
IO
(
DV
.
Vector
CsvHal
)
hal_data
::
IO
(
DV
.
Vector
CsvHal
)
hal_data
=
snd
<$>
CSV
.
readHal
"doc/corpus_imt/Gargantext_Corpus.csv"
hal_data
=
snd
<$>
CSV
.
read
Csv
Hal
"doc/corpus_imt/Gargantext_Corpus.csv"
names
::
S
.
Set
Text
names
::
S
.
Set
Text
names
=
S
.
fromList
$
Gargantext
.
Prelude
.
map
(
\
s
->
school_id
s
)
schools
names
=
S
.
fromList
$
Gargantext
.
Prelude
.
map
(
\
s
->
school_id
s
)
schools
...
...
src/Gargantext/Text/Convert.hs
View file @
110c6265
...
@@ -15,16 +15,18 @@ Format Converter.
...
@@ -15,16 +15,18 @@ Format Converter.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Convert
(
risPress2csv
)
module
Gargantext.Text.Convert
(
risPress2csv
Write
)
where
where
import
System.FilePath
(
FilePath
())
-- , takeExtension)
import
System.FilePath
(
FilePath
())
-- , takeExtension)
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Text.Parsers.CSV
(
writeDocs2Csv
)
import
Gargantext.Text.Parsers.CSV
(
writeDocs2Csv
)
import
Gargantext.Text.Parsers
(
parse
Docs
,
FileFormat
(
..
))
import
Gargantext.Text.Parsers
(
parse
File
,
FileFormat
(
..
))
risPress2csv
::
FilePath
->
IO
()
risPress2csv
Write
::
FilePath
->
IO
()
risPress2csv
f
=
parseDocs
RisPresse
(
f
<>
".ris"
)
risPress2csv
Write
f
=
parseFile
RisPresse
(
f
<>
".ris"
)
>>=
\
hs
->
writeDocs2Csv
(
f
<>
".csv"
)
hs
>>=
\
hs
->
writeDocs2Csv
(
f
<>
".csv"
)
hs
src/Gargantext/Text/Parsers.hs
View file @
110c6265
...
@@ -22,9 +22,10 @@ please follow the types.
...
@@ -22,9 +22,10 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Parsers
(
parse
,
FileFormat
(
..
),
clean
,
parseDocs
)
module
Gargantext.Text.Parsers
(
FileFormat
(
..
),
clean
,
parseFile
)
where
where
--import Data.ByteString (ByteString)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Monad
(
join
)
import
Control.Monad
(
join
)
...
@@ -39,20 +40,18 @@ import Data.String (String())
...
@@ -39,20 +40,18 @@ import Data.String (String())
import
Data.String
()
import
Data.String
()
import
Data.Text
(
Text
)
import
Data.Text
(
Text
)
import
Data.Text.Encoding
(
decodeUtf8
)
import
Data.Text.Encoding
(
decodeUtf8
)
import
Data.Time
(
UTCTime
(
..
))
import
Data.Tuple.Extra
(
both
,
first
,
second
)
import
Data.Tuple.Extra
(
both
,
first
,
second
)
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
qualified
Data.ByteString
as
DB
import
qualified
Data.ByteString
as
DB
import
qualified
Data.Map
as
DM
import
qualified
Data.Map
as
DM
import
qualified
Data.Text
as
DT
import
qualified
Data.Text
as
DT
import
qualified
Data.Time
as
DT
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Prelude
import
Gargantext.Prelude
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
qualified
Gargantext.Text.Parsers.WOS
as
WOS
import
qualified
Gargantext.Text.Parsers.WOS
as
WOS
import
qualified
Gargantext.Text.Parsers.RIS
as
RIS
import
qualified
Gargantext.Text.Parsers.RIS
as
RIS
import
Gargantext.Text.Parsers.RIS.Presse
(
presseEnrich
)
import
Gargantext.Text.Parsers.RIS.Presse
(
presseEnrich
)
import
Gargantext.Text.Parsers.Date
(
parseDate
)
import
qualified
Gargantext.Text.Parsers.Date
as
Date
import
Gargantext.Text.Parsers.CSV
(
parseHal
)
import
Gargantext.Text.Parsers.CSV
(
parseHal
)
import
Gargantext.Text.Terms.Stop
(
detectLang
)
import
Gargantext.Text.Terms.Stop
(
detectLang
)
------------------------------------------------------------------------
------------------------------------------------------------------------
...
@@ -77,33 +76,21 @@ data FileFormat = WOS | RIS | RisPresse
...
@@ -77,33 +76,21 @@ data FileFormat = WOS | RIS | RisPresse
-- | ODT -- Not Implemented / import Pandoc
-- | ODT -- Not Implemented / import Pandoc
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | PDF -- Not Implemented / pdftotext and import Pandoc ?
-- | XML -- Not Implemented / see :
-- | XML -- Not Implemented / see :
-- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
-- TODO: to debug maybe add the filepath in error message
{-
parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
parseFormat = undefined
-}
-- | Parse file into documents
-- | Parse file into documents
-- TODO manage errors here
-- TODO manage errors here
parseDocs
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
-- TODO: to debug maybe add the filepath in error message
parseDocs
CsvHalFormat
p
=
parseHal
p
parseFile
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseDocs
RisPresse
p
=
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
presseEnrich
<$>
parse'
RIS
p
parseFile
CsvHalFormat
p
=
parseHal
p
parseDocs
WOS
p
=
join
$
mapM
(
toDoc
WOS
)
<$>
snd
<$>
enrichWith
(
map
(
first
WOS
.
keys
))
<$>
parse'
WOS
p
parseFile
RisPresse
p
=
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
RisPresse
<$>
readFileWith
RIS
p
parseDocs
ff
p
=
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
parse
ff
p
parseFile
WOS
p
=
join
$
mapM
(
toDoc
WOS
)
<$>
snd
<$>
enrichWith
WOS
<$>
readFileWith
WOS
p
parseFile
ff
p
=
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
enrichWith
ff
<$>
readFileWith
ff
p
type
Year
=
Int
type
Month
=
Int
type
Day
=
Int
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
parseDate'
::
Lang
->
Maybe
Text
->
IO
(
Maybe
UTCTime
,
(
Maybe
Year
,
Maybe
Month
,
Maybe
Day
))
parseDate'
_
Nothing
=
pure
(
Nothing
,
(
Nothing
,
Nothing
,
Nothing
))
parseDate'
l
(
Just
txt
)
=
do
utcTime
<-
parseDate
l
txt
let
(
UTCTime
day
_
)
=
utcTime
let
(
y
,
m
,
d
)
=
DT
.
toGregorian
day
pure
(
Just
utcTime
,
(
Just
(
fromIntegral
y
),
Just
m
,
Just
d
))
toDoc
::
FileFormat
->
[(
Text
,
Text
)]
->
IO
HyperdataDocument
toDoc
::
FileFormat
->
[(
Text
,
Text
)]
->
IO
HyperdataDocument
-- TODO use language for RIS
-- TODO use language for RIS
...
@@ -113,7 +100,7 @@ toDoc ff d = do
...
@@ -113,7 +100,7 @@ toDoc ff d = do
let
dateToParse
=
DT
.
replace
"-"
" "
<$>
lookup
"PY"
d
<>
Just
" "
<>
lookup
"publication_date"
d
let
dateToParse
=
DT
.
replace
"-"
" "
<$>
lookup
"PY"
d
<>
Just
" "
<>
lookup
"publication_date"
d
(
utcTime
,
(
pub_year
,
pub_month
,
pub_day
))
<-
parseDate'
lang
dateToParse
(
utcTime
,
(
pub_year
,
pub_month
,
pub_day
))
<-
Date
.
split
lang
dateToParse
pure
$
HyperdataDocument
(
Just
$
DT
.
pack
$
show
ff
)
pure
$
HyperdataDocument
(
Just
$
DT
.
pack
$
show
ff
)
(
lookup
"doi"
d
)
(
lookup
"doi"
d
)
...
@@ -135,26 +122,28 @@ toDoc ff d = do
...
@@ -135,26 +122,28 @@ toDoc ff d = do
Nothing
Nothing
(
Just
$
(
DT
.
pack
.
show
)
lang
)
(
Just
$
(
DT
.
pack
.
show
)
lang
)
parse
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[(
Text
,
Text
)]])
enrichWith
::
FileFormat
parse
ff
fp
=
enrichWith
identity
<$>
parse'
ff
fp
->
(
a
,
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
->
(
a
,
[[(
Text
,
Text
)]])
enrichWith
RisPresse
=
enrichWith'
presseEnrich
enrichWith
WOS
=
enrichWith'
(
map
(
first
WOS
.
keys
))
enrichWith
_
=
enrichWith'
identity
enrichWith
::
([(
DB
.
ByteString
,
DB
.
ByteString
)]
->
[(
DB
.
ByteString
,
DB
.
ByteString
)])
enrichWith'
::
([(
DB
.
ByteString
,
DB
.
ByteString
)]
->
[(
DB
.
ByteString
,
DB
.
ByteString
)])
->
(
a
,
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
->
(
a
,
[[(
Text
,
Text
)]])
->
(
a
,
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
->
(
a
,
[[(
Text
,
Text
)]])
enrichWith
f
=
second
(
map
both'
.
map
f
.
concat
)
enrichWith
'
f
=
second
(
map
both'
.
map
f
.
concat
)
where
where
both'
=
map
(
both
decodeUtf8
)
both'
=
map
(
both
decodeUtf8
)
parse'
::
FileFormat
->
FilePath
readFileWith
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
->
IO
([
ParseError
],
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
parse'
format
path
=
do
readFileWith
format
path
=
do
files
<-
case
takeExtension
path
of
files
<-
case
takeExtension
path
of
".zip"
->
openZip
path
".zip"
->
openZip
path
_
->
pure
<$>
clean
<$>
DB
.
readFile
path
_
->
pure
<$>
clean
<$>
DB
.
readFile
path
partitionEithers
<$>
mapConcurrently
(
runParser
format
)
files
partitionEithers
<$>
mapConcurrently
(
runParser
format
)
files
-- | withParser:
-- | withParser:
-- According to the format of the text, choose the right parser.
-- According to the format of the text, choose the right parser.
-- TODO withParser :: FileFormat -> Parser [Document]
-- TODO withParser :: FileFormat -> Parser [Document]
...
@@ -181,5 +170,3 @@ clean txt = DBC.map clean' txt
...
@@ -181,5 +170,3 @@ clean txt = DBC.map clean' txt
clean'
'’'
=
'
\'
'
clean'
'’'
=
'
\'
'
clean'
'
\r
'
=
' '
clean'
'
\r
'
=
' '
clean'
c
=
c
clean'
c
=
c
src/Gargantext/Text/Parsers/CSV.hs
View file @
110c6265
...
@@ -17,25 +17,23 @@ CSV parser for Gargantext corpus files.
...
@@ -17,25 +17,23 @@ CSV parser for Gargantext corpus files.
module
Gargantext.Text.Parsers.CSV
where
module
Gargantext.Text.Parsers.CSV
where
import
GHC.Real
(
round
)
import
GHC.IO
(
FilePath
)
import
Control.Applicative
import
Control.Applicative
import
Data.Char
(
ord
)
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Time.Segment
(
jour
)
import
Data.Time.Segment
(
jour
)
import
Data.Vector
(
Vector
)
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
V
import
GHC.IO
(
FilePath
)
import
GHC.Real
(
round
)
import
GHC.Word
(
Word8
)
import
Gargantext.Database.Types.Node
-- (HyperdataDocument(..))
import
Gargantext.Database.Types.Node
-- (HyperdataDocument(..))
import
Gargantext.Prelude
hiding
(
length
)
import
Gargantext.Text
import
Gargantext.Text
import
Gargantext.Text.Context
import
Gargantext.Text.Context
import
Gargantext.Prelude
hiding
(
length
)
import
qualified
Data.ByteString.Lazy
as
BL
import
qualified
Data.ByteString
as
BS
import
qualified
Data.Vector
as
V
---------------------------------------------------------------
---------------------------------------------------------------
headerCsvGargV3
::
Header
headerCsvGargV3
::
Header
...
@@ -193,45 +191,73 @@ hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
...
@@ -193,45 +191,73 @@ hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
csvDecodeOptions
::
DecodeOptions
csvDecodeOptions
::
DecodeOptions
csvDecodeOptions
=
(
defaultDecodeOptions
csvDecodeOptions
=
defaultDecodeOptions
{
decDelimiter
=
delimiter
}
{
decDelimiter
=
fromIntegral
$
ord
'
\t
'
}
)
csvEncodeOptions
::
EncodeOptions
csvEncodeOptions
::
EncodeOptions
csvEncodeOptions
=
(
defaultEncodeOptions
csvEncodeOptions
=
defaultEncodeOptions
{
encDelimiter
=
delimiter
}
{
encDelimiter
=
fromIntegral
$
ord
'
\t
'
}
)
delimiter
::
Word8
delimiter
=
fromIntegral
$
ord
'
\t
'
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn
::
[
CsvDoc
->
Text
]
->
FilePath
->
IO
[
Text
]
readCsvOn
::
[
CsvDoc
->
Text
]
->
FilePath
->
IO
[
Text
]
readCsvOn
fields
fp
=
V
.
toList
<$>
V
.
map
(
\
l
->
intercalate
(
pack
" "
)
$
map
(
\
field
->
field
l
)
fields
)
readCsvOn
fields
fp
=
V
.
toList
<$>
V
.
map
(
\
l
->
intercalate
(
pack
" "
)
$
map
(
\
field
->
field
l
)
fields
)
<$>
snd
<$>
snd
<$>
read
Csv
fp
<$>
read
File
fp
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsv
::
FilePath
->
IO
(
Header
,
Vector
CsvDoc
)
readCsv
fp
=
do
readFileLazy
::
(
FromNamedRecord
a
)
=>
a
->
FilePath
->
IO
(
Header
,
Vector
a
)
csvData
<-
BL
.
readFile
fp
readFileLazy
f
=
fmap
(
readByteStringLazy
f
)
.
BL
.
readFile
case
decodeByNameWith
csvDecodeOptions
csvData
of
readFileStrict
::
(
FromNamedRecord
a
)
=>
a
->
FilePath
->
IO
(
Header
,
Vector
a
)
readFileStrict
f
=
fmap
(
readByteStringStrict
f
)
.
BS
.
readFile
readByteStringLazy
::
(
FromNamedRecord
a
)
=>
a
->
BL
.
ByteString
->
(
Header
,
Vector
a
)
readByteStringLazy
f
bs
=
case
decodeByNameWith
csvDecodeOptions
bs
of
Left
e
->
panic
(
pack
e
)
Right
csvDocs
->
csvDocs
readByteStringStrict
::
(
FromNamedRecord
a
)
=>
a
->
BS
.
ByteString
->
(
Header
,
Vector
a
)
readByteStringStrict
ff
=
(
readByteStringLazy
ff
)
.
BL
.
fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readFile
::
FilePath
->
IO
(
Header
,
Vector
CsvDoc
)
readFile
=
fmap
readCsvLazyBS
.
BL
.
readFile
-- | TODO use readByteStringLazy
readCsvLazyBS
::
BL
.
ByteString
->
(
Header
,
Vector
CsvDoc
)
readCsvLazyBS
bs
=
case
decodeByNameWith
csvDecodeOptions
bs
of
Left
e
->
panic
(
pack
e
)
Left
e
->
panic
(
pack
e
)
Right
csvDocs
->
pure
csvDocs
Right
csvDocs
->
csvDocs
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCsvHal
::
FilePath
->
IO
(
Header
,
Vector
CsvHal
)
readCsvHal
=
fmap
readCsvHalLazyBS
.
BL
.
readFile
readHal
::
FilePath
->
IO
(
Header
,
Vector
CsvHal
)
-- | TODO use readByteStringLazy
readHal
fp
=
do
readCsvHalLazyBS
::
BL
.
ByteString
->
(
Header
,
Vector
CsvHal
)
csvData
<-
BL
.
readFile
fp
readCsvHalLazyBS
bs
=
case
decodeByNameWith
csvDecodeOptions
bs
of
case
decodeByNameWith
csvDecodeOptions
csvData
of
Left
e
->
panic
(
pack
e
)
Left
e
->
panic
(
pack
e
)
Right
csvDocs
->
pure
csvDocs
Right
csvDocs
->
csvDocs
readCsvHalBSStrict
::
BS
.
ByteString
->
(
Header
,
Vector
CsvHal
)
readCsvHalBSStrict
=
readCsvHalLazyBS
.
BL
.
fromStrict
------------------------------------------------------------------------
------------------------------------------------------------------------
write
Csv
::
FilePath
->
(
Header
,
Vector
CsvDoc
)
->
IO
()
write
File
::
FilePath
->
(
Header
,
Vector
CsvDoc
)
->
IO
()
write
Csv
fp
(
h
,
vs
)
=
BL
.
writeFile
fp
$
write
File
fp
(
h
,
vs
)
=
BL
.
writeFile
fp
$
encodeByNameWith
csvEncodeOptions
h
(
V
.
toList
vs
)
encodeByNameWith
csvEncodeOptions
h
(
V
.
toList
vs
)
writeDocs2Csv
::
FilePath
->
[
HyperdataDocument
]
->
IO
()
writeDocs2Csv
::
FilePath
->
[
HyperdataDocument
]
->
IO
()
writeDocs2Csv
fp
hs
=
BL
.
writeFile
fp
$
writeDocs2Csv
fp
hs
=
BL
.
writeFile
fp
$
hyperdataDocument2csv
hs
encodeByNameWith
csvEncodeOptions
headerCsvGargV3
(
map
hyperdataDocument2csvDoc
hs
)
hyperdataDocument2csv
::
[
HyperdataDocument
]
->
BL
.
ByteString
hyperdataDocument2csv
hs
=
encodeByNameWith
csvEncodeOptions
headerCsvGargV3
(
map
hyperdataDocument2csvDoc
hs
)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- Hal Format
-- Hal Format
data
CsvHal
=
CsvHal
data
CsvHal
=
CsvHal
...
@@ -340,7 +366,6 @@ csvHal2doc (CsvHal title source
...
@@ -340,7 +366,6 @@ csvHal2doc (CsvHal title source
------------------------------------------------------------------------
------------------------------------------------------------------------
parseHal
::
FilePath
->
IO
[
HyperdataDocument
]
parseHal
::
FilePath
->
IO
[
HyperdataDocument
]
parseHal
fp
=
map
csvHal2doc
<$>
V
.
toList
<$>
snd
<$>
readHal
fp
parseHal
fp
=
map
csvHal2doc
<$>
V
.
toList
<$>
snd
<$>
read
Csv
Hal
fp
------------------------------------------------------------------------
------------------------------------------------------------------------
src/Gargantext/Text/Parsers/Date.hs
View file @
110c6265
...
@@ -18,12 +18,12 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
...
@@ -18,12 +18,12 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Parsers.Date
(
parse
Date
,
parseDateRaw
)
where
module
Gargantext.Text.Parsers.Date
(
parse
,
parseRaw
,
split
)
where
import
Data.HashMap.Strict
as
HM
hiding
(
map
)
import
Data.HashMap.Strict
as
HM
hiding
(
map
)
import
Data.Text
(
Text
,
unpack
,
splitOn
,
pack
)
import
Data.Text
(
Text
,
unpack
,
splitOn
,
pack
)
import
Data.Time
(
parseTimeOrError
,
defaultTimeLocale
)
import
Data.Time
(
parseTimeOrError
,
defaultTimeLocale
,
toGregorian
)
import
Data.Time.Clock
(
UTCTime
,
getCurrentTime
)
import
Data.Time.Clock
(
UTCTime
(
..
)
,
getCurrentTime
)
import
Data.Time.LocalTime
(
utc
)
import
Data.Time.LocalTime
(
utc
)
import
Data.Time.LocalTime.TimeZone.Series
(
zonedTimeToZoneSeriesTime
)
import
Data.Time.LocalTime.TimeZone.Series
(
zonedTimeToZoneSeriesTime
)
import
Duckling.Api
(
analyze
)
import
Duckling.Api
(
analyze
)
...
@@ -38,21 +38,41 @@ import qualified Data.HashSet as HashSet
...
@@ -38,21 +38,41 @@ import qualified Data.HashSet as HashSet
import
qualified
Duckling.Core
as
DC
import
qualified
Duckling.Core
as
DC
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Parse date to Ints
-- TODO add hours, minutes and seconds
split
::
Lang
->
Maybe
Text
->
IO
(
Maybe
UTCTime
,
(
Maybe
Year
,
Maybe
Month
,
Maybe
Day
))
split
_
Nothing
=
pure
(
Nothing
,
(
Nothing
,
Nothing
,
Nothing
))
split
l
(
Just
txt
)
=
do
utcTime
<-
parse
l
txt
let
(
y
,
m
,
d
)
=
split'
utcTime
pure
(
Just
utcTime
,
(
Just
y
,
Just
m
,
Just
d
))
split'
::
UTCTime
->
(
Year
,
Month
,
Day
)
split'
utcTime
=
(
fromIntegral
y
,
m
,
d
)
where
(
UTCTime
day
_
)
=
utcTime
(
y
,
m
,
d
)
=
toGregorian
day
type
Year
=
Int
type
Month
=
Int
type
Day
=
Int
------------------------------------------------------------------------
-- | Date Parser
-- | Date Parser
-- Parses dates mentions in full text given the language.
-- Parses dates mentions in full text given the language.
-- >>> parseDate FR (pack "10 avril 1979 à 19H")
-- >>> parseDate FR (pack "10 avril 1979 à 19H")
-- 1979-04-10 19:00:00 UTC
-- 1979-04-10 19:00:00 UTC
-- >>> parseDate EN (pack "April 10 1979")
-- >>> parseDate EN (pack "April 10 1979")
-- 1979-04-10 00:00:00 UTC
-- 1979-04-10 00:00:00 UTC
parse
Date
::
Lang
->
Text
->
IO
UTCTime
parse
::
Lang
->
Text
->
IO
UTCTime
parse
Date
lang
s
=
parseDate'
"%Y-%m-%dT%T"
"0-0-0T0:0:0"
lang
s
parse
lang
s
=
parseDate'
"%Y-%m-%dT%T"
"0-0-0T0:0:0"
lang
s
type
DateFormat
=
Text
type
DateFormat
=
Text
type
DateDefault
=
Text
type
DateDefault
=
Text
parseDate'
::
DateFormat
->
DateDefault
->
Lang
->
Text
->
IO
UTCTime
parseDate'
::
DateFormat
->
DateDefault
->
Lang
->
Text
->
IO
UTCTime
parseDate'
format
def
lang
s
=
do
parseDate'
format
def
lang
s
=
do
dateStr'
<-
parse
Date
Raw
lang
s
dateStr'
<-
parseRaw
lang
s
let
dateStr
=
unpack
$
maybe
def
identity
let
dateStr
=
unpack
$
maybe
def
identity
$
head
$
splitOn
"."
dateStr'
$
head
$
splitOn
"."
dateStr'
pure
$
parseTimeOrError
True
defaultTimeLocale
(
unpack
format
)
dateStr
pure
$
parseTimeOrError
True
defaultTimeLocale
(
unpack
format
)
dateStr
...
@@ -70,19 +90,19 @@ parserLang EN = DC.EN
...
@@ -70,19 +90,19 @@ parserLang EN = DC.EN
-- IO can be avoided here:
-- IO can be avoided here:
-- currentContext :: Lang -> IO Context
-- currentContext :: Lang -> IO Context
-- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
-- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
-- parse
Date
Raw :: Context -> Text -> SomeErrorHandling Text
-- parseRaw :: Context -> Text -> SomeErrorHandling Text
-- TODO error handling
-- TODO error handling
parse
Date
Raw
::
Lang
->
Text
->
IO
(
Text
)
parseRaw
::
Lang
->
Text
->
IO
(
Text
)
parse
Date
Raw
lang
text
=
do
parseRaw
lang
text
=
do
maybeJson
<-
map
jsonValue
<$>
parseDateWithDuckling
lang
text
maybeJson
<-
map
jsonValue
<$>
parseDateWithDuckling
lang
text
case
headMay
maybeJson
of
case
headMay
maybeJson
of
Just
(
Json
.
Object
object
)
->
case
HM
.
lookup
"value"
object
of
Just
(
Json
.
Object
object
)
->
case
HM
.
lookup
"value"
object
of
Just
(
Json
.
String
date
)
->
pure
date
Just
(
Json
.
String
date
)
->
pure
date
Just
_
->
panic
"Parse
Date
Raw ERROR: should be a json String"
Just
_
->
panic
"ParseRaw ERROR: should be a json String"
Nothing
->
panic
$
"Parse
Date
Raw ERROR: no date found"
<>
(
pack
.
show
)
lang
<>
" "
<>
text
Nothing
->
panic
$
"ParseRaw ERROR: no date found"
<>
(
pack
.
show
)
lang
<>
" "
<>
text
_
->
panic
$
"Parse
Date
Raw ERROR: type error"
<>
(
pack
.
show
)
lang
<>
" "
<>
text
_
->
panic
$
"ParseRaw ERROR: type error"
<>
(
pack
.
show
)
lang
<>
" "
<>
text
-- | Current Time in DucklingTime format
-- | Current Time in DucklingTime format
...
...
src/Gargantext/Text/Parsers/Json2Csv.hs
View file @
110c6265
...
@@ -28,7 +28,7 @@ import Data.Text (Text, unpack)
...
@@ -28,7 +28,7 @@ import Data.Text (Text, unpack)
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
import
Gargantext.Core.Utils.Prefix
(
unPrefix
)
import
Gargantext.Prelude
import
Gargantext.Prelude
import
System.IO
(
FilePath
)
import
System.IO
(
FilePath
)
import
Gargantext.Text.Parsers.CSV
(
CsvDoc
(
..
),
write
Csv
,
headerCsvGargV3
)
import
Gargantext.Text.Parsers.CSV
(
CsvDoc
(
..
),
write
File
,
headerCsvGargV3
)
import
Data.Vector
(
fromList
)
import
Data.Vector
(
fromList
)
data
Patent
=
Patent
{
_patent_title
::
Text
data
Patent
=
Patent
{
_patent_title
::
Text
...
@@ -48,7 +48,7 @@ type FilePathOut = FilePath
...
@@ -48,7 +48,7 @@ type FilePathOut = FilePath
json2csv
::
FilePathIn
->
FilePathOut
->
IO
()
json2csv
::
FilePathIn
->
FilePathOut
->
IO
()
json2csv
fin
fout
=
do
json2csv
fin
fout
=
do
patents
<-
maybe
(
panic
"json2csv error"
)
identity
<$>
readPatents
fin
patents
<-
maybe
(
panic
"json2csv error"
)
identity
<$>
readPatents
fin
write
Csv
fout
(
headerCsvGargV3
,
fromList
$
map
patent2csvDoc
patents
)
write
File
fout
(
headerCsvGargV3
,
fromList
$
map
patent2csvDoc
patents
)
patent2csvDoc
::
Patent
->
CsvDoc
patent2csvDoc
::
Patent
->
CsvDoc
patent2csvDoc
(
Patent
title
abstract
year
_
)
=
patent2csvDoc
(
Patent
title
abstract
year
_
)
=
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment