Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
160
Issues
160
List
Board
Labels
Milestones
Merge Requests
14
Merge Requests
14
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
3dd27b5a
Verified
Commit
3dd27b5a
authored
Oct 06, 2023
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[istex] implement zip file upload for istex
parent
03bd0bf6
Pipeline
#5213
failed with stages
in 12 minutes and 11 seconds
Changes
21
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
21 changed files
with
305 additions
and
300 deletions
+305
-300
CleanCsvCorpus.hs
bin/gargantext-cli/CleanCsvCorpus.hs
+1
-1
Main.hs
bin/gargantext-cli/Main.hs
+17
-25
cabal.project
cabal.project
+2
-1
gargantext.cabal
gargantext.cabal
+3
-0
Contact.hs
src/Gargantext/API/Node/Contact.hs
+1
-1
New.hs
src/Gargantext/API/Node/Corpus/New.hs
+16
-14
Types.hs
src/Gargantext/API/Node/Corpus/New/Types.hs
+2
-0
FrameCalcUpload.hs
src/Gargantext/API/Node/FrameCalcUpload.hs
+8
-1
IMT.hs
src/Gargantext/Core/Ext/IMT.hs
+8
-13
Convert.hs
src/Gargantext/Core/Text/Convert.hs
+4
-6
Istex.hs
src/Gargantext/Core/Text/Corpus/API/Istex.hs
+7
-37
Parsers.hs
src/Gargantext/Core/Text/Corpus/Parsers.hs
+78
-86
CSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+43
-51
Date.hs
src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
+2
-3
JSON.hs
src/Gargantext/Core/Text/Corpus/Parsers/JSON.hs
+24
-16
Flow.hs
src/Gargantext/Database/Action/Flow.hs
+35
-45
Aeson.hs
src/Gargantext/Utils/Aeson.hs
+12
-0
Jobs.hs
src/Gargantext/Utils/Jobs.hs
+10
-0
Servant.hs
src/Gargantext/Utils/Servant.hs
+10
-0
Tuple.hs
src/Gargantext/Utils/Tuple.hs
+11
-0
UTCTime.hs
src/Gargantext/Utils/UTCTime.hs
+11
-0
No files found.
bin/gargantext-cli/CleanCsvCorpus.hs
View file @
3dd27b5a
...
...
@@ -55,4 +55,4 @@ main = do
putStrLn
$
"Mean size of docs:"
<>
show
(
CSV
.
docsSize
docs'
)
CSV
.
writeFile
wPath
(
h
,
docs'
)
Left
e
->
panic
$
"Error: "
<>
(
pack
e
)
Left
e
->
panic
$
"Error: "
<>
e
bin/gargantext-cli/Main.hs
View file @
3dd27b5a
...
...
@@ -18,34 +18,25 @@ Main specifications to index a corpus with a term list
module
Main
where
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Concurrent
(
getNumCapabilities
,
myThreadId
,
threadCapability
)
import
Control.Monad
(
zipWithM
)
import
Control.Monad.IO.Class
import
Data.Aeson
import
Data.ByteString.Lazy
(
writeFile
)
import
Data.Either
(
Either
(
..
))
import
Data.List
(
cycle
,
concat
,
unwords
)
import
Data.List.Split
(
chunksOf
)
import
Data.Map.Strict
(
Map
)
import
qualified
Data.Map.Strict
as
DM
import
Data.Text
(
pack
,
Text
)
import
qualified
Data.Text
as
DT
import
Data.Map.Strict
qualified
as
DM
import
Data.Text
(
pack
)
import
Data.Text
qualified
as
DT
import
Data.Text.Lazy
qualified
as
DTL
import
Data.Text.Lazy.Encoding
qualified
as
TLE
import
Data.Tuple.Extra
(
both
)
import
qualified
Data.Vector
as
DV
import
Data.Vector
qualified
as
DV
import
GHC.Generics
import
System.IO
(
hPutStr
,
hFlush
,
stderr
)
import
System.Environment
import
Gargantext.Prelude
import
Gargantext.Core
import
Gargantext.Core.Types
import
Gargantext.Core.Text.Terms
import
Gargantext.Core.Text.Context
import
Gargantext.Core.Text.Terms.WithList
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
readCSVFile
,
csv_title
,
csv_abstract
,
csv_publication_year
,
unIntOrDec
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
readCSVFile
,
csv_title
,
csv_abstract
,
csv_publication_year
,
fromMIntOrDec
,
defaultYear
)
import
Gargantext.Core.Text.List.Formats.CSV
(
csvMapTermList
)
import
Gargantext.Core.Text.Terms
(
terms
)
import
Gargantext.Core.Text.Metrics.Count
(
coocOnContexts
,
Coocs
)
import
Gargantext.Core.Text.Terms.WithList
import
Gargantext.Prelude
hiding
(
show
)
import
Protolude
import
System.IO
(
hFlush
)
------------------------------------------------------------------------
-- OUTPUT format
...
...
@@ -78,7 +69,7 @@ filterTermsAndCooc patterns (year, ts) = do
log
m
=
do
tid
<-
myThreadId
(
p
,
_
)
<-
threadCapability
tid
put
StrLn
.
unwords
$
put
Text
.
unwords
$
[
"filterTermsAndCooc:"
,
m
,
show
year
,
"on proc"
,
show
p
]
main
::
IO
()
...
...
@@ -97,14 +88,14 @@ main = do
-- termListMap :: [Text]
termList
<-
csvMapTermList
termListFile
put
StrLn
$
show
$
length
termList
put
Text
$
show
$
length
termList
let
patterns
=
buildPatterns
termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
writeFile
outputFile
$
encode
(
CoocByYears
r
)
Left
e
->
panic
$
"Error: "
<>
(
pack
e
)
writeFile
outputFile
$
DTL
.
toStrict
$
TLE
.
decodeUtf8
$
encode
(
CoocByYears
r
)
Left
e
->
panic
$
"Error: "
<>
e
...
...
@@ -113,7 +104,7 @@ main = do
mapMP
::
MonadIO
m
=>
(
a
->
m
b
)
->
[
a
]
->
m
[
b
]
mapMP
f
xs
=
do
bs
<-
zipWithM
g
(
cycle
"-
\\
|/"
)
xs
liftIO
$
hPutStr
stderr
"
\r
Done
\n
"
liftIO
$
hPutStr
stderr
(
"
\r
Done
\n
"
::
Text
)
pure
bs
where
g
c
x
=
do
...
...
@@ -130,6 +121,7 @@ mapConcurrentlyChunked f ts = do
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms'
::
Applicative
f
=>
Patterns
->
Text
->
f
[[
Text
]]
terms'
pats
txt
=
pure
$
concat
$
extractTermsWithList
pats
txt
...
...
cabal.project
View file @
3dd27b5a
...
...
@@ -94,7 +94,7 @@ source-repository-package
source
-
repository
-
package
type
:
git
location
:
https
://
gitlab
.
iscpif
.
fr
/
gargantext
/
crawlers
/
istex
.
git
tag
:
a34bb341236d82cf3d488210bc1d8448a98f5808
tag
:
9
b1bd17f3ed38eab83e675bb68278922217a9c73
source
-
repository
-
package
type
:
git
...
...
@@ -167,3 +167,4 @@ package hmatrix
package
sparse
-
linear
ghc
-
options
:
-
O2
-
fsimpl
-
tick
-
factor
=
10000
-
fdicts
-
cheap
-
fdicts
-
strict
-
flate
-
dmd
-
anal
-
fno
-
state
-
hack
gargantext.cabal
View file @
3dd27b5a
...
...
@@ -147,6 +147,7 @@ library
Gargantext.Utils.Jobs.State
Gargantext.Utils.SpacyNLP
Gargantext.Utils.Tuple
Gargantext.Utils.Zip
other-modules:
Gargantext.API.Admin.Auth
Gargantext.API.Admin.FrontEnd
...
...
@@ -229,6 +230,7 @@ library
Gargantext.Core.Text.Corpus.Parsers.Iramuteq
Gargantext.Core.Text.Corpus.Parsers.Isidore
Gargantext.Core.Text.Corpus.Parsers.JSON
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
Gargantext.Core.Text.Corpus.Parsers.Json2Csv
Gargantext.Core.Text.Corpus.Parsers.RIS
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
...
...
@@ -665,6 +667,7 @@ executable gargantext-cli
, gargantext-prelude
, ini ^>= 0.4.1
, optparse-generic ^>= 1.4.7
, protolude ^>= 0.3.0
, split ^>= 0.2.3.4
, text ^>= 1.2.4.1
, unordered-containers ^>= 0.2.16.0
...
...
src/Gargantext/API/Node/Contact.hs
View file @
3dd27b5a
...
...
@@ -87,7 +87,7 @@ addContact :: (HasSettings env, FlowCmdM env err m, MonadJobStatus m)
addContact
u
nId
(
AddContactParams
fn
ln
)
jobHandle
=
do
markStarted
2
jobHandle
_
<-
flow
(
Nothing
::
Maybe
HyperdataAnnuaire
)
u
(
Right
[
nId
])
(
Multi
EN
)
Nothing
(
Just
1
,
yield
$
hyperdataContact
fn
ln
)
jobHandle
_
<-
flow
(
Nothing
::
Maybe
HyperdataAnnuaire
)
u
(
Right
[
nId
])
(
Multi
EN
)
Nothing
(
1
,
yield
$
hyperdataContact
fn
ln
)
jobHandle
markComplete
jobHandle
addContact
_uId
_nId
_p
jobHandle
=
do
...
...
src/Gargantext/API/Node/Corpus/New.hs
View file @
3dd27b5a
...
...
@@ -42,7 +42,7 @@ import Gargantext.API.Node.Corpus.Searx
import
Gargantext.API.Node.Corpus.Types
import
Gargantext.API.Node.Corpus.Update
(
addLanguageToCorpus
)
import
Gargantext.API.Node.Types
import
Gargantext.Core
(
Lang
(
..
),
withDefaultLanguage
)
import
Gargantext.Core
(
Lang
(
..
),
withDefaultLanguage
,
defaultLanguage
)
import
Gargantext.Core.NodeStory
(
HasNodeStoryImmediateSaver
,
HasNodeArchiveStoryImmediateSaver
,
currentVersion
)
import
Gargantext.Core.Text.Corpus.API
qualified
as
API
import
Gargantext.Core.Text.Corpus.Parsers
qualified
as
Parser
(
FileType
(
..
),
parseFormatC
)
...
...
@@ -278,33 +278,35 @@ addToCorpusWithForm :: (FlowCmdM env err m, MonadJobStatus m)
->
NewWithForm
->
JobHandle
m
->
m
()
addToCorpusWithForm
user
cid
(
NewWithForm
ft
ff
d
(
withDefaultLanguage
->
l
)
_n
sel
)
jobHandle
=
do
addToCorpusWithForm
user
cid
nwf
jobHandle
=
do
-- printDebug "[addToCorpusWithForm] Parsing corpus: " cid
-- printDebug "[addToCorpusWithForm] fileType" ft
-- printDebug "[addToCorpusWithForm] fileFormat" ff
let
l
=
nwf
^.
wf_lang
.
non
defaultLanguage
addLanguageToCorpus
cid
l
limit'
<-
view
$
hasConfig
.
gc_max_docs_parsers
let
limit
=
fromIntegral
limit'
::
Integer
let
parseC
=
case
ft
of
CSV_HAL
->
Parser
.
parseFormatC
Parser
.
CsvHal
parseC
=
case
(
nwf
^.
wf_filetype
)
of
CSV
->
Parser
.
parseFormatC
Parser
.
CsvGargV3
WOS
->
Parser
.
parseFormatC
Parser
.
WOS
PresseRIS
->
Parser
.
parseFormatC
Parser
.
RisPresse
CSV_HAL
->
Parser
.
parseFormatC
Parser
.
CsvHal
Iramuteq
->
Parser
.
parseFormatC
Parser
.
Iramuteq
Istex
->
Parser
.
parseFormatC
Parser
.
Istex
JSON
->
Parser
.
parseFormatC
Parser
.
JSON
PresseRIS
->
Parser
.
parseFormatC
Parser
.
RisPresse
WOS
->
Parser
.
parseFormatC
Parser
.
WOS
-- TODO granularity of the logStatus
let
data
'
=
case
ff
of
Plain
->
cs
d
ZIP
->
case
BSB64
.
decode
$
TE
.
encodeUtf8
d
of
let
data
'
=
case
(
nwf
^.
wf_fileformat
)
of
Plain
->
cs
(
nwf
^.
wf_data
)
ZIP
->
case
BSB64
.
decode
$
TE
.
encodeUtf8
(
nwf
^.
wf_data
)
of
Left
err
->
panic
$
T
.
pack
"[addToCorpusWithForm] error decoding base64: "
<>
T
.
pack
err
Right
decoded
->
decoded
eDocsC
<-
liftBase
$
parseC
ff
data
'
eDocsC
<-
liftBase
$
parseC
(
nwf
^.
wf_fileformat
)
data
'
case
eDocsC
of
Right
(
mC
ount
,
docsC
)
->
do
Right
(
c
ount
,
docsC
)
->
do
-- TODO Add progress (jobStatus) update for docs - this is a
-- long action
...
...
@@ -333,9 +335,9 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n
_cid'
<-
flowCorpus
user
(
Right
[
cid
])
(
Multi
l
)
(
Just
sel
)
(
Just
(
nwf
^.
wf_selection
)
)
--(Just $ fromIntegral $ length docs, docsC')
(
mC
ount
,
transPipe
liftBase
docsC'
)
-- TODO fix number of docs
(
c
ount
,
transPipe
liftBase
docsC'
)
-- TODO fix number of docs
--(map (map toHyperdataDocument) docs)
jobHandle
...
...
@@ -347,7 +349,7 @@ addToCorpusWithForm user cid (NewWithForm ft ff d (withDefaultLanguage -> l) _n
markComplete
jobHandle
Left
e
->
do
printDebug
"[addToCorpusWithForm] parse error"
e
markFailed
(
Just
$
T
.
pack
e
)
jobHandle
markFailed
(
Just
e
)
jobHandle
{-
addToCorpusWithFile :: FlowCmdM env err m
...
...
src/Gargantext/API/Node/Corpus/New/Types.hs
View file @
3dd27b5a
...
...
@@ -12,6 +12,7 @@ import Gargantext.Prelude
data
FileType
=
CSV
|
CSV_HAL
|
Istex
|
PresseRIS
|
WOS
|
Iramuteq
...
...
@@ -26,6 +27,7 @@ instance ToJSON FileType
instance
FromHttpApiData
FileType
where
parseUrlPiece
"CSV"
=
pure
CSV
parseUrlPiece
"CSV_HAL"
=
pure
CSV_HAL
parseUrlPiece
"Istex"
=
pure
Istex
parseUrlPiece
"PresseRis"
=
pure
PresseRIS
parseUrlPiece
"WOS"
=
pure
WOS
parseUrlPiece
"Iramuteq"
=
pure
Iramuteq
...
...
src/Gargantext/API/Node/FrameCalcUpload.hs
View file @
3dd27b5a
...
...
@@ -90,6 +90,13 @@ frameCalcUploadAsync uId nId (FrameCalcUpload _wf_lang _wf_selection) jobHandle
case
mCId
of
Nothing
->
markFailure
1
Nothing
jobHandle
Just
cId
->
addToCorpusWithForm
(
RootId
(
NodeId
uId
))
cId
(
NewWithForm
CSV
Plain
body
_wf_lang
"calc-upload.csv"
_wf_selection
)
jobHandle
addToCorpusWithForm
(
RootId
(
NodeId
uId
))
cId
(
NewWithForm
{
_wf_filetype
=
CSV
,
_wf_fileformat
=
Plain
,
_wf_data
=
body
,
_wf_lang
,
_wf_name
=
"calc-upload.csv"
,
_wf_selection
})
jobHandle
markComplete
jobHandle
src/Gargantext/Core/Ext/IMT.hs
View file @
3dd27b5a
...
...
@@ -14,22 +14,17 @@ Portability : POSIX
module
Gargantext.Core.Ext.IMT
where
import
Data.Either
(
Either
(
..
))
import
Data.List
qualified
as
DL
import
Data.Map.Strict
(
Map
)
import
Data.Text
(
Text
,
splitOn
)
import
qualified
Data.Set
as
S
import
qualified
Data.List
as
DL
import
qualified
Data.Vector
as
DV
import
qualified
Data.Map.Strict
as
M
import
qualified
Prelude
import
Data.Map.Strict
qualified
as
M
import
Data.Morpheus.Types
(
GQLType
)
import
Data.Set
qualified
as
S
import
Data.Text
(
Text
,
splitOn
)
import
Data.Vector
qualified
as
DV
import
GHC.Generics
(
Generic
)
import
Gargantext.Prelude
import
Gargantext.Core.Text.Metrics.Utils
as
Utils
import
Gargantext.Core.Text.Corpus.Parsers.CSV
as
CSV
import
Gargantext.Core.Text.Metrics.Utils
as
Utils
import
Gargantext.Prelude
data
School
=
School
{
school_shortName
::
Text
,
school_longName
::
Text
...
...
@@ -112,7 +107,7 @@ mapIdSchool :: Map Text Text
mapIdSchool
=
M
.
fromList
$
Gargantext
.
Prelude
.
map
(
\
(
School
{
school_shortName
,
school_id
})
->
(
school_id
,
school_shortName
))
schools
hal_data
::
IO
(
Either
Prelude
.
String
(
DV
.
Vector
CsvHal
))
hal_data
::
IO
(
Either
Text
(
DV
.
Vector
CsvHal
))
hal_data
=
do
r
<-
CSV
.
readCsvHal
"doc/corpus_imt/Gargantext_Corpus.csv"
pure
$
snd
<$>
r
...
...
src/Gargantext/Core/Text/Convert.hs
View file @
3dd27b5a
...
...
@@ -17,12 +17,10 @@ module Gargantext.Core.Text.Convert (risPress2csvWrite)
where
import
Data.Either
(
Either
(
..
))
import
qualified
Data.Text
as
T
import
System.FilePath
(
FilePath
())
-- , takeExtension)
import
Gargantext.Prelude
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
writeDocs2Csv
)
import
Gargantext.Core.Text.Corpus.Parsers
(
parseFile
,
FileFormat
(
..
),
FileType
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
writeDocs2Csv
)
import
Gargantext.Prelude
import
System.FilePath
(
FilePath
())
-- , takeExtension)
risPress2csvWrite
::
FilePath
->
IO
()
...
...
@@ -30,7 +28,7 @@ risPress2csvWrite f = do
eContents
<-
parseFile
RisPresse
Plain
(
f
<>
".ris"
)
case
eContents
of
Right
contents
->
writeDocs2Csv
(
f
<>
".csv"
)
contents
Left
e
->
panic
$
"Error: "
<>
(
T
.
pack
e
)
Left
e
->
panic
$
"Error: "
<>
e
src/Gargantext/Core/Text/Corpus/API/Istex.hs
View file @
3dd27b5a
...
...
@@ -14,19 +14,16 @@ module Gargantext.Core.Text.Corpus.API.Istex
where
import
Data.Either
(
Either
(
..
))
import
Data.List
(
concat
)
import
Data.List
qualified
as
List
import
Data.Maybe
import
Data.Text
(
Text
,
pack
)
import
qualified
Data.Text
as
Text
import
qualified
Data.List
as
List
import
Data.Text
(
Text
)
import
Data.Text
qualified
as
Text
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
(
toDoc
)
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
qualified
Gargantext.Defaults
as
Defaults
import
Gargantext.Prelude
import
qualified
Gargantext.Core.Text.Corpus.Parsers.Date
as
Date
import
qualified
ISTEX
as
ISTEX
import
qualified
ISTEX.Client
as
ISTEX
import
ISTEX
qualified
as
ISTEX
import
ISTEX.Client
qualified
as
ISTEX
type
Query
=
Text
type
MaxResults
=
Maybe
Int
...
...
@@ -76,31 +73,4 @@ toDoc' :: Lang -> ISTEX.Documents -> IO [HyperdataDocument]
toDoc'
la
docs'
=
mapM
(
toDoc
la
)
(
ISTEX
.
_documents_hits
docs'
)
--printDebug "ISTEX" (ISTEX._documents_total docs')
-- | TODO remove dateSplit here
-- TODO current year as default
toDoc
::
Lang
->
ISTEX
.
Document
->
IO
HyperdataDocument
toDoc
la
(
ISTEX
.
Document
i
t
a
ab
d
s
)
=
do
--printDebug "ISTEX date" d
(
utctime
,
(
pub_year
,
pub_month
,
pub_day
))
<-
Date
.
dateSplit
(
maybe
(
Just
$
pack
$
show
Defaults
.
year
)
(
Just
.
pack
.
show
)
d
)
--printDebug "toDoc Istex" (utctime, (pub_year, pub_month, pub_day))
pure
$
HyperdataDocument
{
_hd_bdd
=
Just
"Istex"
,
_hd_doi
=
Just
i
,
_hd_url
=
Nothing
,
_hd_uniqId
=
Nothing
,
_hd_uniqIdBdd
=
Nothing
,
_hd_page
=
Nothing
,
_hd_title
=
t
,
_hd_authors
=
Just
$
foldl
(
\
x
y
->
x
<>
", "
<>
y
)
""
(
map
ISTEX
.
_author_name
a
)
,
_hd_institutes
=
Just
$
foldl
(
\
x
y
->
x
<>
", "
<>
y
)
""
(
concat
$
(
map
ISTEX
.
_author_affiliations
)
a
)
,
_hd_source
=
Just
$
foldl
(
\
x
y
->
x
<>
", "
<>
y
)
""
(
catMaybes
$
map
ISTEX
.
_source_title
s
)
,
_hd_abstract
=
ab
,
_hd_publication_date
=
fmap
(
pack
.
show
)
utctime
,
_hd_publication_year
=
pub_year
,
_hd_publication_month
=
pub_month
,
_hd_publication_day
=
pub_day
,
_hd_publication_hour
=
Nothing
,
_hd_publication_minute
=
Nothing
,
_hd_publication_second
=
Nothing
,
_hd_language_iso2
=
Just
$
(
pack
.
show
)
la
}
src/Gargantext/Core/Text/Corpus/Parsers.hs
View file @
3dd27b5a
This diff is collapsed.
Click to expand it.
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
View file @
3dd27b5a
...
...
@@ -16,25 +16,19 @@ module Gargantext.Core.Text.Corpus.Parsers.CSV where
import
Conduit
import
Control.Applicative
import
qualified
Data.ByteString
as
BS
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Char
(
ord
)
import
Data.ByteString
qualified
as
BS
import
Data.ByteString.Lazy
qualified
as
BL
import
Data.Csv
import
Data.Either
(
Either
(
..
))
import
Data.Maybe
(
fromMaybe
)
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
Data.Text
(
pack
)
import
Data.Text
qualified
as
T
import
Data.Time.Segment
(
jour
)
import
qualified
Data.Vector
as
V
import
Data.Vector
(
Vector
)
import
GHC.IO
(
FilePath
)
import
GHC.Word
(
Word8
)
import
qualified
Prelude
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
hiding
(
length
)
import
Data.Vector
qualified
as
V
import
Gargantext.Core.Text
import
Gargantext.Core.Text.Context
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
hiding
(
length
,
show
)
import
Protolude
---------------------------------------------------------------
headerCsvGargV3
::
Header
...
...
@@ -117,7 +111,7 @@ fromDocs docs = V.map fromDocs' docs
-- | Split a document in its context
-- TODO adapt the size of the paragraph according to the corpus average
splitDoc
::
Mean
->
SplitContext
->
CsvDoc
->
Vector
CsvDoc
splitDoc
m
splt
doc
=
let
docSize
=
(
length
$
csv_abstract
doc
)
in
splitDoc
m
splt
doc
=
let
docSize
=
(
T
.
length
$
csv_abstract
doc
)
in
if
docSize
>
1000
then
if
(
mod
(
round
m
)
docSize
)
>=
10
...
...
@@ -148,7 +142,7 @@ type Mean = Double
docsSize
::
Vector
CsvDoc
->
Mean
docsSize
csvDoc
=
mean
ls
where
ls
=
V
.
toList
$
V
.
map
(
fromIntegral
.
length
.
csv_abstract
)
csvDoc
ls
=
V
.
toList
$
V
.
map
(
fromIntegral
.
T
.
length
.
csv_abstract
)
csvDoc
---------------------------------------------------------------
...
...
@@ -158,7 +152,7 @@ unIntOrDec :: IntOrDec -> Int
unIntOrDec
(
IntOrDec
i
)
=
i
instance
FromField
IntOrDec
where
parseField
s
=
case
runParser
(
parseField
s
::
Parser
Int
)
of
Left
_err
->
IntOrDec
<$>
Prelude
.
floor
<$>
(
parseField
s
::
Parser
Double
)
Left
_err
->
IntOrDec
<$>
floor
<$>
(
parseField
s
::
Parser
Double
)
Right
n
->
pure
$
IntOrDec
n
instance
ToField
IntOrDec
where
toField
(
IntOrDec
i
)
=
toField
i
...
...
@@ -230,44 +224,40 @@ csvEncodeOptions d = defaultEncodeOptions {encDelimiter = delimiter d}
delimiter
::
Delimiter
->
Word8
delimiter
Tab
=
fromIntegral
$
ord
'
\t
'
delimiter
Comma
=
fromIntegral
$
ord
','
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn'
::
[
CsvDoc
->
Text
]
->
FilePath
->
IO
(
Either
Prelude
.
String
[
Text
])
readCsvOn'
fields
fp
=
do
r
<-
readCSVFile
fp
pure
$
(
V
.
toList
.
V
.
map
(
\
l
->
intercalate
(
pack
" "
)
$
map
(
\
field
->
field
l
)
fields
)
.
snd
)
<$>
r
------------------------------------------------------------------------
readFileLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
FilePath
->
IO
(
Either
Prelude
.
String
(
Header
,
Vector
a
))
readFileLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
a
))
readFileLazy
d
f
=
fmap
(
readByteStringLazy
d
f
)
.
BL
.
readFile
readFileStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
FilePath
->
IO
(
Either
Prelude
.
String
(
Header
,
Vector
a
))
->
IO
(
Either
Text
(
Header
,
Vector
a
))
readFileStrict
d
f
=
fmap
(
readByteStringStrict
d
f
)
.
BS
.
readFile
readByteStringLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
BL
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
a
)
readByteStringLazy
_f
d
bs
=
decodeByNameWith
(
csvDecodeOptions
d
)
bs
->
Either
Text
(
Header
,
Vector
a
)
readByteStringLazy
_f
d
bs
=
first
pack
$
decodeByNameWith
(
csvDecodeOptions
d
)
bs
readByteStringStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
Delimiter
->
BS
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
a
)
->
Either
Text
(
Header
,
Vector
a
)
readByteStringStrict
d
ff
=
(
readByteStringLazy
d
ff
)
.
BL
.
fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCSVFile
::
FilePath
->
IO
(
Either
Prelude
.
String
(
Header
,
Vector
CsvDoc
))
readCSVFile
::
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
CsvDoc
))
readCSVFile
fp
=
do
result
<-
fmap
(
readCsvLazyBS
Comma
)
$
BL
.
readFile
fp
case
result
of
...
...
@@ -277,20 +267,24 @@ readCSVFile fp = do
-- | TODO use readByteStringLazy
readCsvLazyBS
::
Delimiter
->
BL
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
CsvDoc
)
readCsvLazyBS
d
bs
=
decodeByNameWith
(
csvDecodeOptions
d
)
bs
readCsvLazyBS
::
Delimiter
->
BL
.
ByteString
->
Either
Text
(
Header
,
Vector
CsvDoc
)
readCsvLazyBS
d
bs
=
first
pack
$
decodeByNameWith
(
csvDecodeOptions
d
)
bs
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCsvHal
::
FilePath
->
IO
(
Either
Prelude
.
String
(
Header
,
Vector
CsvHal
))
readCsvHal
=
fmap
readCsvHalLazyBS
.
BL
.
readFile
readCsvHal
::
FilePath
->
IO
(
Either
Text
(
Header
,
Vector
CsvHal
))
readCsvHal
fp
=
do
c
<-
BL
.
readFile
fp
pure
$
readCsvHalLazyBS
c
-- | TODO use readByteStringLazy
readCsvHalLazyBS
::
BL
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
CsvHal
)
readCsvHalLazyBS
bs
=
decodeByNameWith
(
csvDecodeOptions
Tab
)
bs
readCsvHalLazyBS
::
BL
.
ByteString
->
Either
Text
(
Header
,
Vector
CsvHal
)
readCsvHalLazyBS
bs
=
first
pack
$
decodeByNameWith
(
csvDecodeOptions
Tab
)
bs
readCsvHalBSStrict
::
BS
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
CsvHal
)
readCsvHalBSStrict
=
readCsvHalLazyBS
.
BL
.
fromStrict
readCsvHalBSStrict
::
BS
.
ByteString
->
Either
Text
(
Header
,
Vector
CsvHal
)
readCsvHalBSStrict
bs
=
readCsvHalLazyBS
$
BL
.
fromStrict
bs
------------------------------------------------------------------------
writeFile
::
FilePath
->
(
Header
,
Vector
CsvDoc
)
->
IO
()
...
...
@@ -437,25 +431,25 @@ csv2doc (CsvDoc { .. })
pubDay
=
fromMaybe
defaultDay
csv_publication_day
------------------------------------------------------------------------
parseHal
::
FilePath
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseHal
::
FilePath
->
IO
(
Either
Text
[
HyperdataDocument
])
parseHal
fp
=
do
r
<-
readCsvHal
fp
pure
$
(
V
.
toList
.
V
.
map
csvHal2doc
.
snd
)
<$>
r
parseHal'
::
BL
.
ByteString
->
Either
Prelude
.
String
[
HyperdataDocument
]
parseHal'
::
BL
.
ByteString
->
Either
Text
[
HyperdataDocument
]
parseHal'
bs
=
(
V
.
toList
.
V
.
map
csvHal2doc
.
snd
)
<$>
readCsvHalLazyBS
bs
------------------------------------------------------------------------
parseCsv
::
FilePath
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseCsv
::
FilePath
->
IO
(
Either
Text
[
HyperdataDocument
])
parseCsv
fp
=
fmap
(
V
.
toList
.
V
.
map
csv2doc
.
snd
)
<$>
readCSVFile
fp
{-
parseCsv' :: BL.ByteString -> Either
Prelude.String
[HyperdataDocument]
parseCsv' :: BL.ByteString -> Either
Text
[HyperdataDocument]
parseCsv' bs = (V.toList . V.map csv2doc . snd) <$> readCsvLazyBS Comma bs
-}
parseCsv'
::
BL
.
ByteString
->
Either
Prelude
.
String
[
HyperdataDocument
]
parseCsv'
::
BL
.
ByteString
->
Either
Text
[
HyperdataDocument
]
parseCsv'
bs
=
do
let
result
=
case
readCsvLazyBS
Comma
bs
of
...
...
@@ -464,15 +458,13 @@ parseCsv' bs = do
(
V
.
toList
.
V
.
map
csv2doc
.
snd
)
<$>
result
parseCsvC
::
BL
.
ByteString
->
Either
Prelude
.
String
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
Identity
()
)
parseCsvC
bs
=
do
let
result
=
case
readCsvLazyBS
Comma
bs
of
->
Either
Text
(
Integer
,
ConduitT
()
HyperdataDocument
Identity
()
)
parseCsvC
bs
=
(
\
(
_h
,
rs
)
->
(
fromIntegral
$
V
.
length
rs
,
yieldMany
rs
.|
mapC
csv2doc
))
<$>
eResult
where
eResult
=
case
readCsvLazyBS
Comma
bs
of
Left
_err
->
readCsvLazyBS
Tab
bs
Right
res
->
Right
res
case
result
of
Left
err
->
Left
err
Right
r
->
Right
(
Just
$
Prelude
.
fromIntegral
$
Prelude
.
length
$
snd
r
,
(
yieldMany
$
snd
r
)
.|
mapC
csv2doc
)
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
...
...
src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
View file @
3dd27b5a
...
...
@@ -53,12 +53,11 @@ dateSplit Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
dateSplit
(
Just
txt
)
=
do
utcTime
<-
parse
txt
let
(
y
,
m
,
d
)
=
split'
utcTime
pure
(
Just
utcTime
,
(
Just
y
,
Just
m
,
Just
d
))
pure
(
Just
utcTime
,
(
Just
y
,
Just
m
,
Just
d
))
split'
::
UTCTime
->
(
Year
,
Month
,
Day
)
split'
utcTime
=
(
fromIntegral
y
,
m
,
d
)
split'
(
UTCTime
day
_
)
=
(
fromIntegral
y
,
m
,
d
)
where
(
UTCTime
day
_
)
=
utcTime
(
y
,
m
,
d
)
=
toGregorian
day
type
Year
=
Int
...
...
src/Gargantext/Core/Text/Corpus/Parsers/JSON.hs
View file @
3dd27b5a
...
...
@@ -15,18 +15,17 @@ JSON parser for Gargantext corpus files.
module
Gargantext.Core.Text.Corpus.Parsers.JSON
where
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import
Conduit
import
Data.Aeson
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Either
(
Either
(
..
))
import
Data.Text
import
Data.ByteString.Lazy
qualified
as
BL
import
Data.Text
qualified
as
T
import
GHC.Generics
import
qualified
Prelude
import
Gargantext.Core
(
Lang
)
import
Gargantext.Core.Text.Corpus.Parsers.JSON.Istex
qualified
as
Istex
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
-- import Gargantext.Database.Schema.Node (NodePoly(..))
import
Gargantext.Prelude
hiding
(
length
)
import
Protolude
data
JSONStruct
=
...
...
@@ -62,18 +61,27 @@ instance FromJSON JSONNgrams
------------------------------------------------------------------------
-- | TODO: documents -> document -> hyperdata + title etc
readJSONLazyBS
::
BL
.
ByteString
->
Either
Prelude
.
String
JSONStruct
readJSONLazyBS
bs
=
eitherDecode
bs
readJSONLazyBS
::
(
FromJSON
a
)
=>
BL
.
ByteString
->
Either
Text
a
readJSONLazyBS
bs
=
first
T
.
pack
$
eitherDecode
bs
parseJSONC
::
BL
.
ByteString
->
Either
Prelude
.
String
(
Maybe
Integer
,
ConduitT
()
HyperdataDocument
Identity
()
)
parseJSONC
bs
=
do
case
readJSONLazyBS
bs
of
Left
err
->
Left
err
Right
(
JSONStruct
{
documents
})
->
Right
(
Just
$
Prelude
.
fromIntegral
$
Prelude
.
length
documents
,
yieldMany
documents
.|
mapC
doc2hyperdoc
)
->
Either
Text
(
Integer
,
ConduitT
()
HyperdataDocument
Identity
()
)
parseJSONC
bs
=
f
<$>
readJSONLazyBS
bs
where
f
(
JSONStruct
{
documents
})
=
(
fromIntegral
$
length
documents
,
yieldMany
documents
.|
mapC
doc2hyperdoc
)
doc2hyperdoc
::
JSONStructDocument
->
HyperdataDocument
doc2hyperdoc
(
JSONStructDocument
{
document
=
JSONDocument
{
hyperdata
}
})
=
hyperdata
parseIstex
::
Lang
->
BL
.
ByteString
->
IO
(
Either
Text
HyperdataDocument
)
parseIstex
l
bs
=
do
let
ej
=
readJSONLazyBS
bs
case
ej
of
Left
err
->
pure
$
Left
err
Right
j
->
Right
<$>
Istex
.
toDoc
l
j
src/Gargantext/Database/Action/Flow.hs
View file @
3dd27b5a
This diff is collapsed.
Click to expand it.
src/Gargantext/Utils/Aeson.hs
View file @
3dd27b5a
{-|
Module : Gargantext.Utils.Aeson
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Utilities for handling zip files
-}
module
Gargantext.Utils.Aeson
where
import
Data.Aeson.Types
...
...
src/Gargantext/Utils/Jobs.hs
View file @
3dd27b5a
{-|
Module : Gargantext.Utils.Jobs
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeFamilies #-}
module
Gargantext.Utils.Jobs
(
...
...
src/Gargantext/Utils/Servant.hs
View file @
3dd27b5a
{-|
Module : Gargantext.Utils.Servant
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Gargantext.Utils.Servant
where
import
qualified
Data.ByteString.Lazy.Char8
as
BSC
...
...
src/Gargantext/Utils/Tuple.hs
View file @
3dd27b5a
{-|
Module : Gargantext.Utils.Tuple
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
module
Gargantext.Utils.Tuple
where
import
Protolude
...
...
src/Gargantext/Utils/UTCTime.hs
View file @
3dd27b5a
{-|
Module : Gargantext.Utils.UTCTime
Description : Gargantext utilities
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE TypeFamilies #-}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment