Przemyslaw Kaminski / haskell-gargantext · Commits

Commit 110c6265, authored May 16, 2019 by Alexandre Delanoë

Merge branch 'dev-csv' into dev

Parents: a7ed762a, 68365f40

Showing 10 changed files with 165 additions and 114 deletions (+165 -114)
bin/gargantext-cli/CleanCsvCorpus.hs      +9  -10
package.yaml                              +1  -0
src/Gargantext/API/Upload.hs              +30 -13
src/Gargantext/Database/Flow.hs           +2  -2
src/Gargantext/Ext/IMT.hs                 +1  -1
src/Gargantext/Text/Convert.hs            +6  -4
src/Gargantext/Text/Parsers.hs            +25 -38
src/Gargantext/Text/Parsers/CSV.hs        +57 -32
src/Gargantext/Text/Parsers/Date.hs       +32 -12
src/Gargantext/Text/Parsers/Json2Csv.hs   +2  -2
bin/gargantext-cli/CleanCsvCorpus.hs
@@ -24,14 +24,13 @@ import qualified Data.Vector as V
 import Gargantext.Prelude
 import Gargantext.Text.Search
-import Gargantext.Text.Parsers.CSV
+import qualified Gargantext.Text.Parsers.CSV as CSV

 ------------------------------------------------------------------------
 type Query = [S.Term]

-filterDocs :: [DocId] -> Vector Doc -> Vector Doc
-filterDocs docIds = V.filter (\doc -> S.member (d_docId doc) $ S.fromList docIds)
+filterDocs :: [DocId] -> Vector CSV.Doc -> Vector CSV.Doc
+filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds)

 main :: IO ()
@@ -41,17 +40,17 @@ main = do
   --let q = ["water", "scarcity", "morocco", "shortage","flood"]
   let q = ["gratuit", "gratuité", "culture", "culturel"]

-  (h, csvDocs) <- readCsv rPath
+  (h, csvDocs) <- CSV.readFile rPath

   putStrLn $ "Number of documents before:" <> show (V.length csvDocs)
-  putStrLn $ "Mean size of docs:" <> show (docsSize csvDocs)
+  putStrLn $ "Mean size of docs:" <> show (CSV.docsSize csvDocs)

-  let docs   = toDocs csvDocs
+  let docs   = CSV.toDocs csvDocs
   let engine = insertDocs docs initialDocSearchEngine
   let docIds = S.query engine (map pack q)
-  let docs'  = fromDocs $ filterDocs docIds (V.fromList docs)
+  let docs'  = CSV.fromDocs $ filterDocs docIds (V.fromList docs)

   putStrLn $ "Number of documents after:" <> show (V.length docs')
-  putStrLn $ "Mean size of docs:" <> show (docsSize docs')
+  putStrLn $ "Mean size of docs:" <> show (CSV.docsSize docs')

-  writeCsv wPath (h, docs')
+  CSV.writeFile wPath (h, docs')
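
Note on the pattern above: the tool now imports the CSV module qualified and goes through its read/write entry points. A minimal round-trip sketch of that calling convention, using the CSV.readFile and CSV.writeFile signatures introduced in this commit (FilePath -> IO (Header, Vector CsvDoc) and FilePath -> (Header, Vector CsvDoc) -> IO ()); the file names are hypothetical placeholders, not paths from the repository:

import qualified Gargantext.Text.Parsers.CSV as CSV

-- Read a corpus and write it back unchanged ("in.csv"/"out.csv" are placeholder paths).
roundTrip :: IO ()
roundTrip = do
  (header, docs) <- CSV.readFile "in.csv"
  CSV.writeFile "out.csv" (header, docs)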
package.yaml
@@ -162,6 +162,7 @@ library:
   - servant-swagger
   - servant-swagger-ui
   - servant-static-th
+  - servant-cassava
   - serialise
   - split
   - stemmer
src/Gargantext/API/Upload.hs
@@ -25,17 +25,20 @@ Portability : POSIX
 module Gargantext.API.Upload
   where

+import qualified Data.Text as Text
+import GHC.Generics (Generic)
 import Gargantext.Prelude
 import Data.Text (Text)
+import Data.Aeson
 import Servant
 import Servant.Multipart
 --import Servant.Mock (HasMock(mock))
 import Servant.Swagger (HasSwagger(toSwagger))
-import qualified Data.ByteString.Lazy as LBS
+--import qualified Data.ByteString.Lazy as LBS
 import Control.Monad
 import Control.Monad.IO.Class
 import Gargantext.API.Types
 --import Servant.CSV.Cassava (CSV'(..))
 --import Test.QuickCheck.Arbitrary (Arbitrary, arbitrary)
 --import Data.Swagger
 --import Gargantext.API.Ngrams (TODO)
@@ -67,22 +70,36 @@ instance HasMock (MultipartForm Mem (MultipartData Mem) :> sub) context where
 mock _ _ = undefined
 -}

-type ApiUpload = MultipartForm Mem (MultipartData Mem) :> Post '[JSON] Integer
+data Upload = Upload { up :: [Text] }
+  deriving (Generic)
+
+instance ToJSON Upload
+
+type ApiUpload = MultipartForm Mem (MultipartData Mem) :> Post '[JSON] Text

 -- MultipartData consists in textual inputs,
 -- accessible through its "inputs" field, as well
 -- as files, accessible through its "files" field.
 upload :: GargServer ApiUpload
 upload multipartData = do
-  liftIO $ do
+  --{-
+  is <- liftIO $ do
     putStrLn ("Inputs:" :: Text)
-    forM_ (inputs multipartData) $ \input ->
-      putStrLn $ ("  " :: Text) <> (iName input)
-              <> (" -> " :: Text) <> (iValue input)
-    forM_ (files multipartData) $ \file -> do
-      let content = fdPayload file
-      putStrLn $ ("Content of " :: Text) <> (fdFileName file)
-      LBS.putStr content
-  return 0
+    forM (inputs multipartData) $ \input -> do
+      putStrLn $ ("iName  " :: Text) <> (iName input)
+              <> ("iValue " :: Text) <> (iValue input)
+      pure $ iName input
+  --{-
+  _ <- forM (files multipartData) $ \file -> do
+    let content = fdPayload file
+    putStrLn $ ("XXX " :: Text) <> (fdFileName file)
+    putStrLn $ ("YYY " :: Text) <> cs content
+    --pure $ cs content
+    -- is <- inputs multipartData
+  --}
+  pure $ Text.concat $ map cs is
-------------------------------------------------------------------------------
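
For context on the handler rewrite above: servant-multipart hands the handler a MultipartData value whose inputs are name/value Text pairs (iName, iValue) and whose files carry a file name and payload (fdFileName, fdPayload). A standalone sketch of the same traversal the new handler performs, concatenating the input names into the Text it returns; it is an illustration, not the project's handler, and it swaps the project Prelude for Data.Text.IO so it runs on its own:

{-# LANGUAGE OverloadedStrings #-}
import Control.Monad (forM, forM_)
import Data.Text (Text)
import qualified Data.Text as Text
import qualified Data.Text.IO as TIO
import Servant.Multipart

-- Collect the names of all textual inputs and log the attached file names.
collectInputNames :: MultipartData Mem -> IO Text
collectInputNames multipartData = do
  names <- forM (inputs multipartData) $ \input -> do
    TIO.putStrLn ("input " <> iName input <> " = " <> iValue input)
    pure (iName input)
  forM_ (files multipartData) $ \file ->
    TIO.putStrLn ("file " <> fdFileName file)
  pure (Text.concat names)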
src/Gargantext/Database/Flow.hs
@@ -59,7 +59,7 @@ import Gargantext.Ext.IMT (toSchoolName)
 import Gargantext.Ext.IMTUser (deserialiseImtUsersFromFile)
 import Gargantext.Prelude
 import Gargantext.Text.List (buildNgramsLists, StopSize(..))
-import Gargantext.Text.Parsers (parseDocs, FileFormat)
+import Gargantext.Text.Parsers (parseFile, FileFormat)
 import Gargantext.Text.Terms (TermType(..), tt_lang)
 import Gargantext.Text.Terms (extractTerms)
 import Gargantext.Text.Terms.Mono.Stem.En (stemIt)
@@ -114,7 +114,7 @@ flowCorpusFile :: FlowCmdM env ServantErr m
 flowCorpusFile u n l la ff fp = do
-  docs <- liftIO ( splitEvery 500 <$> take l <$> parseDocs ff fp )
+  docs <- liftIO ( splitEvery 500 <$> take l <$> parseFile ff fp )
   flowCorpus u n la (map (map toHyperdataDocument) docs)
src/Gargantext/Ext/IMT.hs
@@ -101,7 +101,7 @@ mapIdSchool :: Map Text Text
 mapIdSchool = M.fromList $ Gargantext.Prelude.map (\(School n _ i) -> (i, n)) schools

 hal_data :: IO (DV.Vector CsvHal)
-hal_data = snd <$> CSV.readHal    "doc/corpus_imt/Gargantext_Corpus.csv"
+hal_data = snd <$> CSV.readCsvHal "doc/corpus_imt/Gargantext_Corpus.csv"

 names :: S.Set Text
 names = S.fromList $ Gargantext.Prelude.map (\s -> school_id s) schools
src/Gargantext/Text/Convert.hs
@@ -15,16 +15,18 @@ Format Converter.
 {-# LANGUAGE PackageImports    #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Convert (risPress2csv)
+module Gargantext.Text.Convert (risPress2csvWrite)
   where

 import System.FilePath (FilePath()) -- , takeExtension)
 import Gargantext.Prelude
 import Gargantext.Text.Parsers.CSV (writeDocs2Csv)
-import Gargantext.Text.Parsers (parseDocs, FileFormat(..))
+import Gargantext.Text.Parsers (parseFile, FileFormat(..))

-risPress2csv :: FilePath -> IO ()
-risPress2csv f = parseDocs RisPresse (f <> ".ris")
+risPress2csvWrite :: FilePath -> IO ()
+risPress2csvWrite f = parseFile RisPresse (f <> ".ris")
       >>= \hs -> writeDocs2Csv (f <> ".csv") hs
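
Usage remains a one-liner; the renamed function still takes the base path without an extension, reading <base>.ris and writing <base>.csv. A hypothetical call (the path is a placeholder):

risPress2csvWrite "doc/presse/corpus"   -- reads doc/presse/corpus.ris, writes doc/presse/corpus.csv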
src/Gargantext/Text/Parsers.hs
@@ -22,9 +22,10 @@ please follow the types.
 {-# LANGUAGE PackageImports    #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Parsers (parse, FileFormat(..), clean, parseDocs)
+module Gargantext.Text.Parsers (FileFormat(..), clean, parseFile)
     where

 --import Data.ByteString (ByteString)
 import "zip" Codec.Archive.Zip (withArchive, getEntry, getEntries)
 import Control.Concurrent.Async as CCA (mapConcurrently)
 import Control.Monad (join)
@@ -39,20 +40,18 @@ import Data.String (String())
 import Data.String ()
 import Data.Text (Text)
 import Data.Text.Encoding (decodeUtf8)
-import Data.Time (UTCTime(..))
 import Data.Tuple.Extra (both, first, second)
 import System.FilePath (FilePath(), takeExtension)
 import qualified Data.ByteString as DB
 import qualified Data.Map as DM
 import qualified Data.Text as DT
-import qualified Data.Time as DT
 import Gargantext.Core (Lang(..))
 import Gargantext.Prelude
 import Gargantext.Database.Types.Node (HyperdataDocument(..))
 import qualified Gargantext.Text.Parsers.WOS as WOS
 import qualified Gargantext.Text.Parsers.RIS as RIS
 import Gargantext.Text.Parsers.RIS.Presse (presseEnrich)
-import Gargantext.Text.Parsers.Date (parseDate)
+import qualified Gargantext.Text.Parsers.Date as Date
 import Gargantext.Text.Parsers.CSV (parseHal)
 import Gargantext.Text.Terms.Stop (detectLang)
 ------------------------------------------------------------------------
@@ -77,33 +76,21 @@ data FileFormat = WOS | RIS | RisPresse
   -- | ODT -- Not Implemented / import Pandoc
   -- | PDF -- Not Implemented / pdftotext and import Pandoc ?
   -- | XML -- Not Implemented / see :
   -- -- > http://chrisdone.com/posts/fast-haskell-c-parsing-xml
--- TODO: to debug maybe add the filepath in error message
 {-
 parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
 parseFormat = undefined
 -}

--- | Parse file into documents
--- TODO manage errors here
-parseDocs :: FileFormat -> FilePath -> IO [HyperdataDocument]
-parseDocs CsvHalFormat p = parseHal p
-parseDocs RisPresse    p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith presseEnrich          <$> parse' RIS p
-parseDocs WOS          p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith (map (first WOS.keys)) <$> parse' WOS p
-parseDocs ff           p = join $ mapM (toDoc ff)  <$> snd <$> parse ff p
-
-type Year  = Int
-type Month = Int
-type Day   = Int
-
--- | Parse date to Ints
--- TODO add hours, minutes and seconds
-parseDate' :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
-parseDate' _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
-parseDate' l (Just txt) = do
-    utcTime <- parseDate l txt
-    let (UTCTime day _) = utcTime
-    let (y, m, d) = DT.toGregorian day
-    pure (Just utcTime, (Just (fromIntegral y), Just m, Just d))
+-- TODO: to debug maybe add the filepath in error message
+parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument]
+parseFile CsvHalFormat p = parseHal p
+parseFile RisPresse    p = join $ mapM (toDoc RIS) <$> snd <$> enrichWith RisPresse <$> readFileWith RIS p
+parseFile WOS          p = join $ mapM (toDoc WOS) <$> snd <$> enrichWith WOS       <$> readFileWith WOS p
+parseFile ff           p = join $ mapM (toDoc ff)  <$> snd <$> enrichWith ff        <$> readFileWith ff  p

 toDoc :: FileFormat -> [(Text, Text)] -> IO HyperdataDocument
 -- TODO use language for RIS
@@ -113,7 +100,7 @@ toDoc ff d = do
       let dateToParse = DT.replace "-" " " <$> lookup "PY" d <> Just " " <> lookup "publication_date" d

-      (utcTime, (pub_year, pub_month, pub_day)) <- parseDate' lang dateToParse
+      (utcTime, (pub_year, pub_month, pub_day)) <- Date.split lang dateToParse

       pure $ HyperdataDocument (Just $ DT.pack $ show ff)
                                (lookup "doi" d)
@@ -135,26 +122,28 @@ toDoc ff d = do
                                Nothing
                                (Just $ (DT.pack . show) lang)

-parse :: FileFormat -> FilePath -> IO ([ParseError], [[(Text, Text)]])
-parse ff fp = enrichWith identity <$> parse' ff fp
+enrichWith :: FileFormat -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
+enrichWith RisPresse = enrichWith' presseEnrich
+enrichWith WOS       = enrichWith' (map (first WOS.keys))
+enrichWith _         = enrichWith' identity

-enrichWith :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
-           -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
-enrichWith f = second (map both' . map f . concat)
+enrichWith' :: ([(DB.ByteString, DB.ByteString)] -> [(DB.ByteString, DB.ByteString)])
+            -> (a, [[[(DB.ByteString, DB.ByteString)]]]) -> (a, [[(Text, Text)]])
+enrichWith' f = second (map both' . map f . concat)
   where
     both' = map (both decodeUtf8)

-parse' :: FileFormat -> FilePath
-       -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
-parse' format path = do
+readFileWith :: FileFormat -> FilePath
+             -> IO ([ParseError], [[[(DB.ByteString, DB.ByteString)]]])
+readFileWith format path = do
     files <- case takeExtension path of
       ".zip" -> openZip                    path
      _       -> pure <$> clean <$> DB.readFile path
    partitionEithers <$> mapConcurrently (runParser format) files

 -- | withParser:
 -- According to the format of the text, choose the right parser.
 -- TODO withParser :: FileFormat -> Parser [Document]
@@ -181,5 +170,3 @@ clean txt = DBC.map clean' txt
     clean' '’' = '\''
     clean' '\r' = ' '
     clean' c = c
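
With parseDocs renamed to parseFile, callers now go through a single exported entry point keyed on FileFormat. A minimal caller sketch, assuming the exported parseFile :: FileFormat -> FilePath -> IO [HyperdataDocument] shown above; "export.zip" is a placeholder path (zip archives are unpacked by readFileWith, plain files are read directly):

import Gargantext.Text.Parsers (FileFormat(..), parseFile)

-- Count the documents parsed from a Web of Science export.
countWosDocs :: IO ()
countWosDocs = do
  docs <- parseFile WOS "export.zip"
  print (length docs)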
src/Gargantext/Text/Parsers/CSV.hs
@@ -17,25 +17,23 @@ CSV parser for Gargantext corpus files.
 module Gargantext.Text.Parsers.CSV where

-import GHC.Real (round)
-import GHC.IO (FilePath)
 import Control.Applicative
 import Data.Char (ord)
 import Data.Csv
 import Data.Either (Either(Left, Right))
 import Data.Text (Text, pack, length, intercalate)
-import qualified Data.ByteString.Lazy as BL
 import Data.Time.Segment (jour)
 import Data.Vector (Vector)
-import qualified Data.Vector as V
+import GHC.IO (FilePath)
+import GHC.Real (round)
+import GHC.Word (Word8)
 import Gargantext.Database.Types.Node -- (HyperdataDocument(..))
-import Gargantext.Prelude hiding (length)
 import Gargantext.Text
 import Gargantext.Text.Context
+import Gargantext.Prelude hiding (length)
+import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString as BS
+import qualified Data.Vector as V
 ---------------------------------------------------------------
 headerCsvGargV3 :: Header
@@ -193,45 +191,73 @@ hyperdataDocument2csvDoc h = CsvDoc (m $ _hyperdataDocument_title h)
 csvDecodeOptions :: DecodeOptions
-csvDecodeOptions = (defaultDecodeOptions {decDelimiter = fromIntegral $ ord '\t'})
+csvDecodeOptions = defaultDecodeOptions {decDelimiter = delimiter}

 csvEncodeOptions :: EncodeOptions
-csvEncodeOptions = ( defaultEncodeOptions {encDelimiter = fromIntegral $ ord '\t'} )
+csvEncodeOptions = defaultEncodeOptions {encDelimiter = delimiter}
+
+delimiter :: Word8
+delimiter = fromIntegral $ ord '\t'
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
 readCsvOn :: [CsvDoc -> Text] -> FilePath -> IO [Text]
 readCsvOn fields fp = V.toList
                   <$> V.map (\l -> intercalate (pack " ") $ map (\field -> field l) fields)
                   <$> snd
-                  <$> readCsv  fp
+                  <$> readFile fp

 ------------------------------------------------------------------------
-readCsv :: FilePath -> IO (Header, Vector CsvDoc)
-readCsv fp = do
-    csvData <- BL.readFile fp
-    case decodeByNameWith csvDecodeOptions csvData of
-      Left e        -> panic (pack e)
-      Right csvDocs -> pure csvDocs
+readFileLazy :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
+readFileLazy f = fmap (readByteStringLazy f) . BL.readFile
+
+readFileStrict :: (FromNamedRecord a) => a -> FilePath -> IO (Header, Vector a)
+readFileStrict f = fmap (readByteStringStrict f) . BS.readFile
+
+readByteStringLazy :: (FromNamedRecord a) => a -> BL.ByteString -> (Header, Vector a)
+readByteStringLazy f bs = case decodeByNameWith csvDecodeOptions bs of
+      Left e        -> panic (pack e)
+      Right csvDocs -> csvDocs
+
+readByteStringStrict :: (FromNamedRecord a) => a -> BS.ByteString -> (Header, Vector a)
+readByteStringStrict ff = (readByteStringLazy ff) . BL.fromStrict

 ------------------------------------------------------------------------
+-- | TODO use readFileLazy
+readFile :: FilePath -> IO (Header, Vector CsvDoc)
+readFile = fmap readCsvLazyBS . BL.readFile
+
+-- | TODO use readByteStringLazy
+readCsvLazyBS :: BL.ByteString -> (Header, Vector CsvDoc)
+readCsvLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
+      Left e        -> panic (pack e)
+      Right csvDocs -> csvDocs

 ------------------------------------------------------------------------
-readHal :: FilePath -> IO (Header, Vector CsvHal)
-readHal fp = do
-    csvData <- BL.readFile fp
-    case decodeByNameWith csvDecodeOptions csvData of
-      Left e        -> panic (pack e)
-      Right csvDocs -> pure csvDocs
+-- | TODO use readFileLazy
+readCsvHal :: FilePath -> IO (Header, Vector CsvHal)
+readCsvHal = fmap readCsvHalLazyBS . BL.readFile
+
+-- | TODO use readByteStringLazy
+readCsvHalLazyBS :: BL.ByteString -> (Header, Vector CsvHal)
+readCsvHalLazyBS bs = case decodeByNameWith csvDecodeOptions bs of
+      Left e        -> panic (pack e)
+      Right csvDocs -> csvDocs
+
+readCsvHalBSStrict :: BS.ByteString -> (Header, Vector CsvHal)
+readCsvHalBSStrict = readCsvHalLazyBS . BL.fromStrict

 ------------------------------------------------------------------------
-writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
-writeCsv fp (h, vs) = BL.writeFile fp $
+writeFile :: FilePath -> (Header, Vector CsvDoc) -> IO ()
+writeFile fp (h, vs) = BL.writeFile fp $
                       encodeByNameWith csvEncodeOptions h (V.toList vs)

 writeDocs2Csv :: FilePath -> [HyperdataDocument] -> IO ()
-writeDocs2Csv fp hs = BL.writeFile fp $ encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)
+writeDocs2Csv fp hs = BL.writeFile fp $ hyperdataDocument2csv hs
+
+hyperdataDocument2csv :: [HyperdataDocument] -> BL.ByteString
+hyperdataDocument2csv hs = encodeByNameWith csvEncodeOptions headerCsvGargV3 (map hyperdataDocument2csvDoc hs)

 ------------------------------------------------------------------------
 -- Hal Format
 data CsvHal = CsvHal
@@ -340,7 +366,6 @@ csvHal2doc (CsvHal title source
 ------------------------------------------------------------------------
 parseHal :: FilePath -> IO [HyperdataDocument]
-parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readHal    fp
+parseHal fp = map csvHal2doc <$> V.toList <$> snd <$> readCsvHal fp
 ------------------------------------------------------------------------
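
The writer side now separates file output from serialisation: writeDocs2Csv delegates to hyperdataDocument2csv, which can also be used on its own to obtain the tab-separated bytes in memory. A short sketch assuming the signatures above; "out.csv" is a placeholder path:

import qualified Data.ByteString.Lazy as BL
import Gargantext.Database.Types.Node (HyperdataDocument)
import Gargantext.Text.Parsers.CSV (writeDocs2Csv, hyperdataDocument2csv)

-- Serialise documents to disk, or keep the CSV bytes in memory.
saveDocs :: [HyperdataDocument] -> IO ()
saveDocs docs = do
  writeDocs2Csv "out.csv" docs
  print (BL.length (hyperdataDocument2csv docs))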
src/Gargantext/Text/Parsers/Date.hs
@@ -18,12 +18,12 @@ DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
 {-# LANGUAGE NoImplicitPrelude #-}
 {-# LANGUAGE OverloadedStrings #-}

-module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw) where
+module Gargantext.Text.Parsers.Date (parse, parseRaw, split) where

 import Data.HashMap.Strict as HM hiding (map)
 import Data.Text (Text, unpack, splitOn, pack)
-import Data.Time (parseTimeOrError, defaultTimeLocale)
-import Data.Time.Clock (UTCTime, getCurrentTime)
+import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
+import Data.Time.Clock (UTCTime(..), getCurrentTime)
 import Data.Time.LocalTime (utc)
 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
 import Duckling.Api (analyze)
@@ -38,21 +38,41 @@ import qualified Data.HashSet as HashSet
 import qualified Duckling.Core as DC
 ------------------------------------------------------------------------
+-- | Parse date to Ints
+-- TODO add hours, minutes and seconds
+split :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
+split _ Nothing    = pure (Nothing, (Nothing, Nothing, Nothing))
+split l (Just txt) = do
+  utcTime <- parse l txt
+  let (y, m, d) = split' utcTime
+  pure (Just utcTime, (Just y, Just m, Just d))
+
+split' :: UTCTime -> (Year, Month, Day)
+split' utcTime = (fromIntegral y, m, d)
+  where
+    (UTCTime day _) = utcTime
+    (y, m, d) = toGregorian day
+
+type Year  = Int
+type Month = Int
+type Day   = Int
+------------------------------------------------------------------------
 -- | Date Parser
 -- Parses dates mentions in full text given the language.
 -- >>> parseDate FR (pack "10 avril 1979 à 19H")
 -- 1979-04-10 19:00:00 UTC
 -- >>> parseDate EN (pack "April 10 1979")
 -- 1979-04-10 00:00:00 UTC
-parseDate :: Lang -> Text -> IO UTCTime
-parseDate lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
+parse :: Lang -> Text -> IO UTCTime
+parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s

 type DateFormat  = Text
 type DateDefault = Text

 parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
 parseDate' format def lang s = do
-  dateStr' <- parseDateRaw lang s
+  dateStr' <- parseRaw lang s
   let dateStr = unpack $ maybe def identity $ head $ splitOn "." dateStr'
   pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
@@ -70,19 +90,19 @@ parserLang EN = DC.EN
 -- IO can be avoided here:
 -- currentContext :: Lang -> IO Context
 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
--- parseDateRaw :: Context -> Text -> SomeErrorHandling Text
+-- parseRaw :: Context -> Text -> SomeErrorHandling Text
 -- TODO error handling
-parseDateRaw :: Lang -> Text -> IO (Text)
-parseDateRaw lang text = do
+parseRaw :: Lang -> Text -> IO (Text)
+parseRaw lang text = do
    maybeJson <- map jsonValue <$> parseDateWithDuckling lang text
    case headMay maybeJson of
      Just (Json.Object object) -> case HM.lookup "value" object of
        Just (Json.String date) -> pure date
-       Just _                  -> panic "ParseDateRaw ERROR: should be a json String"
-       Nothing                 -> panic $ "ParseDateRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
+       Just _                  -> panic "ParseRaw ERROR: should be a json String"
+       Nothing                 -> panic $ "ParseRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
-     _                         -> panic $ "ParseDateRaw ERROR: type error" <> (pack . show) lang <> " " <> text
+     _                         -> panic $ "ParseRaw ERROR: type error" <> (pack . show) lang <> " " <> text

 -- | Current Time in DucklingTime format
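
The new split helper is the piece Parsers.hs now relies on: it parses a date mention and also returns its Gregorian components. A minimal call sketch, assuming the FR constructor of Lang as used in the doctests above; the date string is an example, not project data:

import Data.Text (pack)
import Gargantext.Core (Lang(..))
import qualified Gargantext.Text.Parsers.Date as Date

-- Parse a French date mention and split it into year/month/day.
splitExample :: IO ()
splitExample = do
  (mUtc, (y, m, d)) <- Date.split FR (Just (pack "12 avril 2010"))
  print (mUtc, y, m, d)   -- expected: the UTCTime plus Just 2010, Just 4, Just 12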
src/Gargantext/Text/Parsers/Json2Csv.hs
@@ -28,7 +28,7 @@ import Data.Text (Text, unpack)
 import Gargantext.Core.Utils.Prefix (unPrefix)
 import Gargantext.Prelude
 import System.IO (FilePath)
-import Gargantext.Text.Parsers.CSV (CsvDoc(..), writeCsv, headerCsvGargV3)
+import Gargantext.Text.Parsers.CSV (CsvDoc(..), writeFile, headerCsvGargV3)
 import Data.Vector (fromList)

 data Patent = Patent { _patent_title :: Text
@@ -48,7 +48,7 @@ type FilePathOut = FilePath
 json2csv :: FilePathIn -> FilePathOut -> IO ()
 json2csv fin fout = do
   patents <- maybe (panic "json2csv error") identity <$> readPatents fin
-  writeCsv  fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)
+  writeFile fout (headerCsvGargV3, fromList $ map patent2csvDoc patents)

 patent2csvDoc :: Patent -> CsvDoc
 patent2csvDoc (Patent title abstract year _) =