Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Christian Merten
haskell-gargantext
Commits
d63df339
Commit
d63df339
authored
Aug 05, 2021
by
Przemyslaw Kaminski
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[upload] fix csv upload job response
Now the job doesn't return 'succeeded' fields when it actually fails.
parent
8667dfeb
Changes
14
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
265 additions
and
220 deletions
+265
-220
Main.hs
bin/gargantext-adaptative-phylo/Main.hs
+58
-49
CleanCsvCorpus.hs
bin/gargantext-cli/CleanCsvCorpus.hs
+14
-11
Main.hs
bin/gargantext-cli/Main.hs
+26
-29
Main.hs
bin/gargantext-phylo/Main.hs
+10
-9
Types.hs
src/Gargantext/API/Admin/Orchestrator/Types.hs
+0
-12
Job.hs
src/Gargantext/API/Job.hs
+21
-16
New.hs
src/Gargantext/API/Node/Corpus/New.hs
+36
-20
FrameCalcUpload.hs
src/Gargantext/API/Node/FrameCalcUpload.hs
+4
-3
Routes.hs
src/Gargantext/API/Routes.hs
+3
-3
IMT.hs
src/Gargantext/Core/Ext/IMT.hs
+9
-4
Convert.hs
src/Gargantext/Core/Text/Convert.hs
+8
-2
Parsers.hs
src/Gargantext/Core/Text/Corpus/Parsers.hs
+33
-21
CSV.hs
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
+34
-33
Flow.hs
src/Gargantext/Database/Action/Flow.hs
+9
-8
No files found.
bin/gargantext-adaptative-phylo/Main.hs
View file @
d63df339
...
...
@@ -16,11 +16,24 @@ Adaptative Phylo binaries
module
Main
where
import
Control.Concurrent.Async
(
mapConcurrently
)
import
Crypto.Hash.SHA256
(
hash
)
import
Data.Aeson
import
Data.Either
(
Either
(
..
))
import
Data.List
(
concat
,
nub
,
isSuffixOf
)
import
Data.String
(
String
)
import
GHC.IO
(
FilePath
)
import
qualified
Prelude
as
Prelude
import
System.Environment
import
System.Directory
(
listDirectory
,
doesFileExist
)
import
Data.Text
(
Text
,
unwords
,
unpack
,
replace
,
pack
)
import
Crypto.Hash.SHA256
(
hash
)
import
Data.Time.Calendar
(
fromGregorian
,
diffGregorianDurationClip
,
cdMonths
,
diffDays
,
showGregorian
)
import
qualified
Data.ByteString.Char8
as
C8
import
qualified
Data.ByteString.Lazy
as
Lazy
import
qualified
Data.Vector
as
Vector
import
qualified
Gargantext.Core.Text.Corpus.Parsers.CSV
as
Csv
import
qualified
Data.Text
as
T
import
Gargantext.Prelude
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
...
...
@@ -36,20 +49,6 @@ import Gargantext.Core.Viz.Phylo.PhyloTools (printIOMsg, printIOComment, setCon
import
Gargantext.Core.Viz.Phylo.PhyloExport
(
toPhyloExport
,
dotToFile
)
-- import Gargantext.API.Ngrams.Prelude (toTermList)
import
GHC.IO
(
FilePath
)
import
Prelude
(
Either
(
Left
,
Right
),
toInteger
)
import
System.Environment
import
System.Directory
(
listDirectory
,
doesFileExist
)
import
Control.Concurrent.Async
(
mapConcurrently
)
import
Data.Time.Calendar
(
fromGregorian
,
diffGregorianDurationClip
,
cdMonths
,
diffDays
,
showGregorian
)
import
qualified
Data.ByteString.Char8
as
C8
import
qualified
Data.ByteString.Lazy
as
Lazy
import
qualified
Data.Vector
as
Vector
import
qualified
Gargantext.Core.Text.Corpus.Parsers.CSV
as
Csv
import
qualified
Data.Text
as
T
-- import Debug.Trace (trace)
data
PhyloStage
=
PhyloWithCliques
|
PhyloWithLinks
deriving
(
Show
)
...
...
@@ -84,13 +83,13 @@ toDays y m d = fromIntegral
toPhyloDate
::
Int
->
Int
->
Int
->
TimeUnit
->
Date
toPhyloDate
y
m
d
tu
=
case
tu
of
Year
_
_
_
->
y
Month
_
_
_
->
toMonths
(
toInteger
y
)
m
d
Week
_
_
_
->
div
(
toDays
(
toInteger
y
)
m
d
)
7
Day
_
_
_
->
toDays
(
toInteger
y
)
m
d
Month
_
_
_
->
toMonths
(
Prelude
.
toInteger
y
)
m
d
Week
_
_
_
->
div
(
toDays
(
Prelude
.
toInteger
y
)
m
d
)
7
Day
_
_
_
->
toDays
(
Prelude
.
toInteger
y
)
m
d
toPhyloDate'
::
Int
->
Int
->
Int
->
Text
toPhyloDate'
y
m
d
=
pack
$
showGregorian
$
fromGregorian
(
toInteger
y
)
m
d
toPhyloDate'
y
m
d
=
pack
$
showGregorian
$
fromGregorian
(
Prelude
.
toInteger
y
)
m
d
--------------
...
...
@@ -113,9 +112,14 @@ termsInText pats txt = nub $ concat $ map (map unwords) $ extractTermsWithList p
-- | To transform a Wos file (or [file]) into a list of Docs
wosToDocs
::
Int
->
Patterns
->
TimeUnit
->
FilePath
->
IO
([
Document
])
wosToDocs
::
Int
->
Patterns
->
TimeUnit
->
FilePath
->
IO
[
Document
]
wosToDocs
limit
patterns
time
path
=
do
files
<-
getFilesFromPath
path
let
parseFile'
file
=
do
eParsed
<-
parseFile
WOS
(
path
<>
file
)
case
eParsed
of
Right
ps
->
pure
ps
Left
e
->
panic
$
"Error: "
<>
(
pack
e
)
take
limit
<$>
map
(
\
d
->
let
title
=
fromJust
$
_hd_title
d
abstr
=
if
(
isJust
$
_hd_abstract
d
)
...
...
@@ -134,22 +138,27 @@ wosToDocs limit patterns time path = do
<$>
mapConcurrently
(
\
file
->
filter
(
\
d
->
(
isJust
$
_hd_publication_year
d
)
&&
(
isJust
$
_hd_title
d
))
<$>
parseFile
WOS
(
path
<>
file
)
)
files
<$>
parseFile'
file
)
files
-- To transform a Csv file into a list of Document
csvToDocs
::
CorpusParser
->
Patterns
->
TimeUnit
->
FilePath
->
IO
([
Document
])
csvToDocs
::
CorpusParser
->
Patterns
->
TimeUnit
->
FilePath
->
IO
[
Document
]
csvToDocs
parser
patterns
time
path
=
case
parser
of
Wos
_
->
undefined
Csv
limit
->
Vector
.
toList
<$>
Vector
.
take
limit
<$>
Vector
.
map
(
\
row
->
Document
(
toPhyloDate
(
csv_publication_year
row
)
(
csv_publication_month
row
)
(
csv_publication_day
row
)
time
)
Csv
limit
->
do
eR
<-
Csv
.
readFile
path
case
eR
of
Right
r
->
pure
$
Vector
.
toList
$
Vector
.
take
limit
$
Vector
.
map
(
\
row
->
Document
(
toPhyloDate
(
csv_publication_year
row
)
(
csv_publication_month
row
)
(
csv_publication_day
row
)
time
)
(
toPhyloDate'
(
csv_publication_year
row
)
(
csv_publication_month
row
)
(
csv_publication_day
row
))
(
termsInText
patterns
$
(
csv_title
row
)
<>
" "
<>
(
csv_abstract
row
))
Nothing
[]
)
<$>
snd
<$>
Csv
.
readFile
path
)
$
snd
r
Left
e
->
panic
$
"Error: "
<>
(
pack
e
)
Csv'
limit
->
Vector
.
toList
<$>
Vector
.
take
limit
<$>
Vector
.
map
(
\
row
->
Document
(
toPhyloDate
(
csv'_publication_year
row
)
(
csv'_publication_month
row
)
(
csv'_publication_day
row
)
time
)
...
...
bin/gargantext-cli/CleanCsvCorpus.hs
View file @
d63df339
...
...
@@ -15,6 +15,7 @@ compress the contexts around the main terms of the query.
module
CleanCsvCorpus
where
--import GHC.IO (FilePath)
import
Data.Either
(
Either
(
..
))
import
Data.SearchEngine
as
S
import
qualified
Data.Set
as
S
import
Data.Text
(
pack
)
...
...
@@ -39,8 +40,9 @@ main = do
--let q = ["water", "scarcity", "morocco", "shortage","flood"]
let
q
=
[
"gratuit"
,
"gratuité"
,
"culture"
,
"culturel"
]
(
h
,
csvDocs
)
<-
CSV
.
readFile
rPath
eDocs
<-
CSV
.
readFile
rPath
case
eDocs
of
Right
(
h
,
csvDocs
)
->
do
putStrLn
$
"Number of documents before:"
<>
show
(
V
.
length
csvDocs
)
putStrLn
$
"Mean size of docs:"
<>
show
(
CSV
.
docsSize
csvDocs
)
...
...
@@ -53,3 +55,4 @@ main = do
putStrLn
$
"Mean size of docs:"
<>
show
(
CSV
.
docsSize
docs'
)
CSV
.
writeFile
wPath
(
h
,
docs'
)
Left
e
->
panic
$
"Error: "
<>
(
pack
e
)
bin/gargantext-cli/Main.hs
View file @
d63df339
...
...
@@ -17,30 +17,24 @@ Main specifications to index a corpus with a term list
module
Main
where
import
Data.ByteString.Lazy
(
writeFile
)
import
Data.Text
(
pack
)
import
qualified
Data.Text
as
DT
import
Data.Tuple.Extra
(
both
)
import
qualified
Data.Vector
as
DV
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Concurrent
(
getNumCapabilities
,
myThreadId
,
threadCapability
)
import
Control.Monad
(
zipWithM
)
import
Control.Monad.IO.Class
import
Data.Map
(
Map
)
import
qualified
Data.Map
as
DM
import
GHC.Generics
import
Data.Aeson
import
Data.
Text
(
Text
)
import
Data.ByteString.Lazy
(
writeFile
)
import
Data.
Either
(
Either
(
..
)
)
import
Data.List
(
cycle
,
concat
,
unwords
)
import
Data.List.Split
(
chunksOf
)
import
Data.Map
(
Map
)
import
qualified
Data.Map
as
DM
import
Data.Text
(
pack
,
Text
)
import
qualified
Data.Text
as
DT
import
Data.Tuple.Extra
(
both
)
import
qualified
Data.Vector
as
DV
import
GHC.Generics
import
System.IO
(
hPutStr
,
hFlush
,
stderr
)
import
System.Environment
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Concurrent
(
getNumCapabilities
,
myThreadId
,
threadCapability
)
import
Gargantext.Prelude
import
Gargantext.Core
...
...
@@ -92,11 +86,13 @@ main = do
[
corpusFile
,
termListFile
,
outputFile
]
<-
getArgs
--corpus :: IO (DM.IntMap [[Text]])
corpus
<-
DM
.
fromListWith
(
<>
)
eCorpusFile
<-
readFile
corpusFile
case
eCorpusFile
of
Right
cf
->
do
let
corpus
=
DM
.
fromListWith
(
<>
)
.
DV
.
toList
.
DV
.
map
(
\
n
->
(
csv_publication_year
n
,
[(
csv_title
n
)
<>
" "
<>
(
csv_abstract
n
)]))
.
snd
<$>
readFile
corpusFile
.
snd
$
cf
-- termListMap :: [Text]
termList
<-
csvMapTermList
termListFile
...
...
@@ -108,6 +104,7 @@ main = do
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r
<-
mapConcurrently
(
filterTermsAndCooc
patterns
)
(
DM
.
toList
corpus
)
writeFile
outputFile
$
encode
(
CoocByYears
r
)
Left
e
->
panic
$
"Error: "
<>
(
pack
e
)
...
...
bin/gargantext-phylo/Main.hs
View file @
d63df339
...
...
@@ -24,6 +24,16 @@ import Data.Maybe
import
Data.Text
(
Text
,
unwords
)
import
GHC.Generics
import
GHC.IO
(
FilePath
)
import
System.Directory
(
doesFileExist
)
import
System.Environment
import
qualified
Data.ByteString.Lazy
as
L
import
qualified
Data.List
as
DL
import
qualified
Data.Map
as
DM
import
qualified
Data.Text
as
DT
import
qualified
Data.Vector
as
DV
import
qualified
Gargantext.Core.Text.Corpus.Parsers.CSV
as
CSV
import
qualified
Prelude
as
P
import
Gargantext.Database.Admin.Types.Hyperdata
import
Gargantext.Prelude
import
Gargantext.Core.Text.Context
(
TermList
)
...
...
@@ -36,15 +46,6 @@ import Gargantext.Core.Viz.Phylo.LevelMaker
import
Gargantext.Core.Viz.Phylo.Tools
import
Gargantext.Core.Viz.Phylo.View.Export
import
Gargantext.Core.Viz.Phylo.View.ViewMaker
import
System.Directory
(
doesFileExist
)
import
System.Environment
import
qualified
Data.ByteString.Lazy
as
L
import
qualified
Data.List
as
DL
import
qualified
Data.Map
as
DM
import
qualified
Data.Text
as
DT
import
qualified
Data.Vector
as
DV
import
qualified
Gargantext.Core.Text.Corpus.Parsers.CSV
as
CSV
import
qualified
Prelude
as
P
--------------
...
...
src/Gargantext/API/Admin/Orchestrator/Types.hs
View file @
d63df339
...
...
@@ -131,18 +131,6 @@ type ScrapersEnv = JobEnv JobLog JobLog
type
ScraperAPI
=
AsyncJobsAPI
JobLog
ScraperInput
JobLog
jobLogInit
::
Int
->
JobLog
jobLogInit
n
=
JobLog
{
_scst_succeeded
=
Just
n
,
_scst_failed
=
Just
0
,
_scst_remaining
=
Just
0
,
_scst_events
=
Just
[]
}
jobLogSucc
::
JobLog
->
JobLog
jobLogSucc
jl
=
over
(
scst_succeeded
.
_Just
)
(
+
1
)
$
over
(
scst_remaining
.
_Just
)
(
\
c
->
c
-
1
)
jl
jobLogErr
::
JobLog
->
JobLog
jobLogErr
jl
=
over
(
scst_failed
.
_Just
)
(
+
1
)
$
over
(
scst_remaining
.
_Just
)
(
\
c
->
c
-
1
)
jl
------------------------------------------------------------------------
type
AsyncJobs
event
ctI
input
output
=
AsyncJobsAPI'
'U
n
safe
'S
a
fe
ctI
'[
J
SON
]
Maybe
event
input
output
src/Gargantext/API/Job.hs
View file @
d63df339
module
Gargantext.API.Job
where
import
Control.Lens
(
over
,
_Just
)
import
Data.IORef
import
Data.Maybe
import
Gargantext.Prelude
import
Gargantext.API.Admin.Orchestrator.Types
(
JobLog
(
..
))
import
Gargantext.API.Admin.Orchestrator.Types
jobLogInit
::
Int
->
JobLog
...
...
@@ -16,25 +17,29 @@ jobLogInit rem =
,
_scst_events
=
Just
[]
}
jobLogSuccess
::
JobLog
->
JobLog
jobLogSuccess
(
JobLog
{
_scst_succeeded
=
mSucc
,
_scst_remaining
=
mRem
,
_scst_failed
=
mFail
,
_scst_events
=
evt
})
=
JobLog
{
_scst_succeeded
=
(
+
1
)
<$>
mSucc
,
_scst_remaining
=
(
\
x
->
x
-
1
)
<$>
mRem
,
_scst_failed
=
mFail
,
_scst_events
=
evt
}
jobLogSuccess
jl
=
over
(
scst_succeeded
.
_Just
)
(
+
1
)
$
over
(
scst_remaining
.
_Just
)
(
\
x
->
x
-
1
)
jl
jobLogFail
::
JobLog
->
JobLog
jobLogFail
(
JobLog
{
_scst_succeeded
=
mSucc
jobLogFail
jl
=
over
(
scst_failed
.
_Just
)
(
+
1
)
$
over
(
scst_remaining
.
_Just
)
(
\
x
->
x
-
1
)
jl
jobLogFailTotal
::
JobLog
->
JobLog
jobLogFailTotal
(
JobLog
{
_scst_succeeded
=
mSucc
,
_scst_remaining
=
mRem
,
_scst_failed
=
mFail
,
_scst_events
=
evt
})
=
JobLog
{
_scst_succeeded
=
mSucc
,
_scst_remaining
=
(
\
x
->
x
-
1
)
<$>
m
Rem
,
_scst_failed
=
(
+
1
)
<$>
m
Fail
,
_scst_remaining
=
new
Rem
,
_scst_failed
=
new
Fail
,
_scst_events
=
evt
}
where
(
newRem
,
newFail
)
=
case
mRem
of
Nothing
->
(
Nothing
,
mFail
)
Just
rem
->
(
Just
0
,
(
+
rem
)
<$>
mFail
)
jobLogEvt
::
JobLog
->
ScraperEvent
->
JobLog
jobLogEvt
jl
evt
=
over
(
scst_events
.
_Just
)
(
\
evts
->
(
evt
:
evts
))
jl
runJobLog
::
MonadBase
IO
m
=>
Int
->
(
JobLog
->
m
()
)
->
m
(
m
()
,
m
()
,
m
JobLog
)
runJobLog
num
logStatus
=
do
...
...
src/Gargantext/API/Node/Corpus/New.hs
View file @
d63df339
...
...
@@ -27,6 +27,8 @@ import Data.Swagger
import
Data.Text
(
Text
)
import
qualified
Data.Text
as
T
import
GHC.Generics
(
Generic
)
import
qualified
Prelude
as
Prelude
import
Protolude
(
readFile
)
import
Servant
import
Servant.Job.Utils
(
jsonOptions
)
-- import Servant.Multipart
...
...
@@ -35,8 +37,9 @@ import Test.QuickCheck.Arbitrary
import
Gargantext.Prelude
import
Gargantext.API.Admin.Orchestrator.Types
(
JobLog
(
..
),
AsyncJobs
,
jobLogSucc
)
import
Gargantext.API.Admin.Orchestrator.Types
(
JobLog
(
..
),
AsyncJobs
)
import
Gargantext.API.Admin.Types
(
HasSettings
)
import
Gargantext.API.Job
(
jobLogSuccess
,
jobLogFailTotal
)
import
Gargantext.API.Node.Corpus.New.File
import
Gargantext.API.Node.Corpus.Searx
import
Gargantext.API.Node.Corpus.Types
...
...
@@ -248,9 +251,10 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
PresseRIS
->
Parser
.
parseFormat
Parser
.
RisPresse
-- TODO granularity of the logStatus
docs
<-
liftBase
$
splitEvery
500
<$>
take
1000000
<$>
parse
(
cs
d
)
eDocs
<-
liftBase
$
parse
$
cs
d
case
eDocs
of
Right
docs'
->
do
let
docs
=
splitEvery
500
$
take
1000000
docs'
printDebug
"Parsing corpus finished : "
cid
logStatus
jobLog2
...
...
@@ -266,10 +270,22 @@ addToCorpusWithForm user cid (NewWithForm ft d l _n) logStatus jobLog = do
printDebug
"sending email"
(
"xxxxxxxxxxxxxxxxxxxxx"
::
Text
)
sendMail
user
pure
jobLog3
logStatus
jobLog3
pure
$
jobLog3
Left
e
->
do
printDebug
"Error"
e
logStatus
jobLogE
pure
jobLogE
where
jobLog2
=
jobLogSucc
jobLog
jobLog3
=
jobLogSucc
jobLog2
jobLog2
=
jobLogSuccess
jobLog
jobLog3
=
jobLogSuccess
jobLog2
jobLogE
=
jobLogFailTotal
jobLog
parseCsvGargV3Path
::
[
Char
]
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseCsvGargV3Path
fp
=
do
contents
<-
readFile
fp
Parser
.
parseFormat
Parser
.
CsvGargV3
$
cs
contents
{-
addToCorpusWithFile :: FlowCmdM env err m
...
...
src/Gargantext/API/Node/FrameCalcUpload.hs
View file @
d63df339
...
...
@@ -17,7 +17,8 @@ import Servant
import
Servant.Job.Async
import
Web.FormUrlEncoded
(
FromForm
)
import
Gargantext.API.Admin.Orchestrator.Types
(
JobLog
(
..
),
AsyncJobs
,
jobLogInit
,
jobLogSucc
,
jobLogErr
)
import
Gargantext.API.Admin.Orchestrator.Types
(
JobLog
(
..
),
AsyncJobs
)
import
Gargantext.API.Job
(
jobLogInit
,
jobLogSuccess
,
jobLogFail
)
import
Gargantext.API.Node.Corpus.New
(
addToCorpusWithForm
)
import
Gargantext.API.Node.Corpus.New.File
(
FileType
(
..
))
import
Gargantext.API.Node.Types
(
NewWithForm
(
..
))
...
...
@@ -82,8 +83,8 @@ frameCalcUploadAsync uId nId _f logStatus jobLog = do
-- printDebug "[frameCalcUploadAsync] mCId" mCId
jobLog2
<-
case
mCId
of
Nothing
->
pure
$
jobLog
Err
jobLog
Nothing
->
pure
$
jobLog
Fail
jobLog
Just
cId
->
addToCorpusWithForm
(
RootId
(
NodeId
uId
))
cId
(
NewWithForm
CSV
body
Nothing
"calc-upload.csv"
)
logStatus
jobLog
pure
$
jobLogSucc
jobLog2
pure
$
jobLogSucc
ess
jobLog2
src/Gargantext/API/Routes.hs
View file @
d63df339
...
...
@@ -43,8 +43,8 @@ import qualified Gargantext.API.Public as Public
import
Gargantext.API.Admin.Auth.Types
(
AuthRequest
,
AuthResponse
,
AuthenticatedUser
(
..
),
PathId
(
..
))
import
Gargantext.API.Admin.Auth
(
withAccess
)
import
Gargantext.API.Admin.FrontEnd
(
FrontEndAPI
)
import
Gargantext.API.Admin.Orchestrator.Types
(
jobLogInit
)
import
Gargantext.API.Count
(
CountAPI
,
count
,
Query
)
import
Gargantext.API.Job
(
jobLogInit
)
import
Gargantext.API.Ngrams
(
TableNgramsApi
,
apiNgramsTableDoc
)
import
Gargantext.API.Node
import
Gargantext.API.Prelude
...
...
@@ -285,7 +285,7 @@ addCorpusWithForm user cid =
JobFunction
(
\
i
log'
->
let
log''
x
=
do
printDebug
"
addToCorpusWithForm
"
x
printDebug
"
[addToCorpusWithForm]
"
x
liftBase
$
log'
x
in
New
.
addToCorpusWithForm
user
cid
i
log''
(
jobLogInit
3
))
...
...
@@ -295,7 +295,7 @@ addCorpusWithFile user cid =
JobFunction
(
\
i
log'
->
let
log''
x
=
do
printDebug
"
addToCorpusWithFile
"
x
printDebug
"
[addToCorpusWithFile]
"
x
liftBase
$
log'
x
in
New
.
addToCorpusWithFile
user
cid
i
log''
)
...
...
src/Gargantext/Core/Ext/IMT.hs
View file @
d63df339
...
...
@@ -11,14 +11,17 @@ Portability : POSIX
module
Gargantext.Core.Ext.IMT
where
import
Gargantext.Prelude
import
Data.Text
(
Text
,
splitOn
)
import
Data.Either
(
Either
(
..
))
import
Data.Map
(
Map
)
import
Data.Text
(
Text
,
splitOn
)
import
qualified
Data.Set
as
S
import
qualified
Data.List
as
DL
import
qualified
Data.Vector
as
DV
import
qualified
Data.Map
as
M
import
qualified
Prelude
as
Prelude
import
Gargantext.Prelude
import
Gargantext.Core.Text.Metrics.Utils
as
Utils
import
Gargantext.Core.Text.Corpus.Parsers.CSV
as
CSV
...
...
@@ -98,8 +101,10 @@ schools = [ School
mapIdSchool
::
Map
Text
Text
mapIdSchool
=
M
.
fromList
$
Gargantext
.
Prelude
.
map
(
\
(
School
n
_
i
)
->
(
i
,
n
))
schools
hal_data
::
IO
(
DV
.
Vector
CsvHal
)
hal_data
=
snd
<$>
CSV
.
readCsvHal
"doc/corpus_imt/Gargantext_Corpus.csv"
hal_data
::
IO
(
Either
Prelude
.
String
(
DV
.
Vector
CsvHal
))
hal_data
=
do
r
<-
CSV
.
readCsvHal
"doc/corpus_imt/Gargantext_Corpus.csv"
pure
$
snd
<$>
r
names
::
S
.
Set
Text
names
=
S
.
fromList
$
Gargantext
.
Prelude
.
map
(
\
s
->
school_id
s
)
schools
...
...
src/Gargantext/Core/Text/Convert.hs
View file @
d63df339
...
...
@@ -16,15 +16,21 @@ Format Converter.
module
Gargantext.Core.Text.Convert
(
risPress2csvWrite
)
where
import
Data.Either
(
Either
(
..
))
import
qualified
Data.Text
as
T
import
System.FilePath
(
FilePath
())
-- , takeExtension)
import
Gargantext.Prelude
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
writeDocs2Csv
)
import
Gargantext.Core.Text.Corpus.Parsers
(
parseFile
,
FileFormat
(
..
))
risPress2csvWrite
::
FilePath
->
IO
()
risPress2csvWrite
f
=
parseFile
RisPresse
(
f
<>
".ris"
)
>>=
\
hs
->
writeDocs2Csv
(
f
<>
".csv"
)
hs
risPress2csvWrite
f
=
do
eContents
<-
parseFile
RisPresse
(
f
<>
".ris"
)
case
eContents
of
Right
contents
->
writeDocs2Csv
(
f
<>
".csv"
)
contents
Left
e
->
panic
$
"Error: "
<>
(
T
.
pack
e
)
src/Gargantext/Core/Text/Corpus/Parsers.hs
View file @
d63df339
...
...
@@ -36,18 +36,20 @@ import Data.String()
import
Data.Text
(
Text
)
import
Data.Text.Encoding
(
decodeUtf8
)
import
Data.Tuple.Extra
(
both
,
first
,
second
)
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
)
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
qualified
Data.ByteString
as
DB
import
qualified
Data.ByteString.Char8
as
DBC
import
qualified
Data.ByteString.Lazy
as
DBL
import
qualified
Data.Map
as
DM
import
qualified
Data.Text
as
DT
import
qualified
Prelude
as
Prelude
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
import
Gargantext.Core.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
)
import
Gargantext.Core.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
-- import Gargantext.Core.Text.Learn (detectLangDefault)
import
qualified
Gargantext.Core.Text.Corpus.Parsers.Date
as
Date
import
qualified
Gargantext.Core.Text.Corpus.Parsers.RIS
as
RIS
import
qualified
Gargantext.Core.Text.Corpus.Parsers.WOS
as
WOS
...
...
@@ -75,30 +77,40 @@ data FileFormat = WOS | RIS | RisPresse
-- | XML -- Not Implemented / see :
parseFormat
::
FileFormat
->
DB
.
ByteString
->
IO
[
HyperdataDocument
]
parseFormat
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseFormat
CsvGargV3
bs
=
pure
$
parseCsv'
$
DBL
.
fromStrict
bs
parseFormat
CsvHal
bs
=
pure
$
parseHal'
$
DBL
.
fromStrict
bs
parseFormat
RisPresse
bs
=
mapM
(
toDoc
RIS
)
parseFormat
RisPresse
bs
=
do
docs
<-
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
RisPresse
$
partitionEithers
$
[
runParser'
RisPresse
bs
]
parseFormat
WOS
bs
=
mapM
(
toDoc
WOS
)
pure
$
Right
docs
parseFormat
WOS
bs
=
do
docs
<-
mapM
(
toDoc
WOS
)
<$>
snd
<$>
enrichWith
WOS
$
partitionEithers
$
[
runParser'
WOS
bs
]
pure
$
Right
docs
parseFormat
_
_
=
undefined
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseFile
::
FileFormat
->
FilePath
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseFile
CsvHal
p
=
parseHal
p
parseFile
CsvGargV3
p
=
parseCsv
p
parseFile
RisPresse
p
=
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
RisPresse
<$>
readFileWith
RIS
p
parseFile
WOS
p
=
join
$
mapM
(
toDoc
WOS
)
<$>
snd
<$>
enrichWith
WOS
<$>
readFileWith
WOS
p
parseFile
ff
p
=
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
enrichWith
ff
<$>
readFileWith
ff
p
parseFile
RisPresse
p
=
do
docs
<-
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
RisPresse
<$>
readFileWith
RIS
p
pure
$
Right
docs
parseFile
WOS
p
=
do
docs
<-
join
$
mapM
(
toDoc
WOS
)
<$>
snd
<$>
enrichWith
WOS
<$>
readFileWith
WOS
p
pure
$
Right
docs
parseFile
ff
p
=
do
docs
<-
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
enrichWith
ff
<$>
readFileWith
ff
p
pure
$
Right
docs
toDoc
::
FileFormat
->
[(
Text
,
Text
)]
->
IO
HyperdataDocument
-- TODO use language for RIS
...
...
src/Gargantext/Core/Text/Corpus/Parsers/CSV.hs
View file @
d63df339
...
...
@@ -19,7 +19,7 @@ import qualified Data.ByteString as BS
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Either
(
Either
(
..
))
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
Data.Time.Segment
(
jour
)
import
qualified
Data.Vector
as
V
...
...
@@ -27,6 +27,8 @@ import Data.Vector (Vector)
import
GHC.IO
(
FilePath
)
import
GHC.Word
(
Word8
)
import
qualified
Prelude
as
Prelude
import
Gargantext.Database.Admin.Types.Hyperdata
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
hiding
(
length
)
import
Gargantext.Core.Text
...
...
@@ -192,52 +194,47 @@ delimiter :: Word8
delimiter
=
fromIntegral
$
ord
'
\t
'
------------------------------------------------------------------------
------------------------------------------------------------------------
readCsvOn'
::
[
CsvDoc
->
Text
]
->
FilePath
->
IO
[
Text
]
readCsvOn'
fields
fp
=
V
.
toList
<$>
V
.
map
(
\
l
->
intercalate
(
pack
" "
)
$
map
(
\
field
->
field
l
)
fields
)
<$>
snd
<$>
readFile
fp
readCsvOn'
::
[
CsvDoc
->
Text
]
->
FilePath
->
IO
(
Either
Prelude
.
String
[
Text
])
readCsvOn'
fields
fp
=
do
r
<-
readFile
fp
pure
$
(
V
.
toList
.
V
.
map
(
\
l
->
intercalate
(
pack
" "
)
$
map
(
\
field
->
field
l
)
fields
)
.
snd
)
<$>
r
------------------------------------------------------------------------
readFileLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
FilePath
->
IO
(
Header
,
Vector
a
)
readFileLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
FilePath
->
IO
(
Either
Prelude
.
String
(
Header
,
Vector
a
)
)
readFileLazy
f
=
fmap
(
readByteStringLazy
f
)
.
BL
.
readFile
readFileStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
FilePath
->
IO
(
Header
,
Vector
a
)
readFileStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
FilePath
->
IO
(
Either
Prelude
.
String
(
Header
,
Vector
a
)
)
readFileStrict
f
=
fmap
(
readByteStringStrict
f
)
.
BS
.
readFile
readByteStringLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
BL
.
ByteString
->
(
Header
,
Vector
a
)
readByteStringLazy
_f
bs
=
case
decodeByNameWith
csvDecodeOptions
bs
of
Left
e
->
panic
(
pack
e
)
Right
csvDocs
->
csvDocs
readByteStringLazy
::
(
FromNamedRecord
a
)
=>
proxy
a
->
BL
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
a
)
readByteStringLazy
_f
bs
=
decodeByNameWith
csvDecodeOptions
bs
readByteStringStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
BS
.
ByteString
->
(
Header
,
Vector
a
)
readByteStringStrict
::
(
FromNamedRecord
a
)
=>
proxy
a
->
BS
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
a
)
readByteStringStrict
ff
=
(
readByteStringLazy
ff
)
.
BL
.
fromStrict
------------------------------------------------------------------------
-- | TODO use readFileLazy
readFile
::
FilePath
->
IO
(
Header
,
Vector
CsvDoc
)
readFile
::
FilePath
->
IO
(
Either
Prelude
.
String
(
Header
,
Vector
CsvDoc
)
)
readFile
=
fmap
readCsvLazyBS
.
BL
.
readFile
-- | TODO use readByteStringLazy
readCsvLazyBS
::
BL
.
ByteString
->
(
Header
,
Vector
CsvDoc
)
readCsvLazyBS
bs
=
case
decodeByNameWith
csvDecodeOptions
bs
of
Left
e
->
panic
(
pack
e
)
Right
csvDocs
->
csvDocs
readCsvLazyBS
::
BL
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
CsvDoc
)
readCsvLazyBS
bs
=
decodeByNameWith
csvDecodeOptions
bs
------------------------------------------------------------------------
-- | TODO use readFileLazy
readCsvHal
::
FilePath
->
IO
(
Header
,
Vector
CsvHal
)
readCsvHal
::
FilePath
->
IO
(
Either
Prelude
.
String
(
Header
,
Vector
CsvHal
)
)
readCsvHal
=
fmap
readCsvHalLazyBS
.
BL
.
readFile
-- | TODO use readByteStringLazy
readCsvHalLazyBS
::
BL
.
ByteString
->
(
Header
,
Vector
CsvHal
)
readCsvHalLazyBS
bs
=
case
decodeByNameWith
csvDecodeOptions
bs
of
Left
e
->
panic
(
pack
e
)
Right
csvDocs
->
csvDocs
readCsvHalLazyBS
::
BL
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
CsvHal
)
readCsvHalLazyBS
bs
=
decodeByNameWith
csvDecodeOptions
bs
readCsvHalBSStrict
::
BS
.
ByteString
->
(
Header
,
Vector
CsvHal
)
readCsvHalBSStrict
::
BS
.
ByteString
->
Either
Prelude
.
String
(
Header
,
Vector
CsvHal
)
readCsvHalBSStrict
=
readCsvHalLazyBS
.
BL
.
fromStrict
------------------------------------------------------------------------
...
...
@@ -382,18 +379,22 @@ csv2doc (CsvDoc title source
Nothing
------------------------------------------------------------------------
parseHal
::
FilePath
->
IO
[
HyperdataDocument
]
parseHal
fp
=
V
.
toList
<$>
V
.
map
csvHal2doc
<$>
snd
<$>
readCsvHal
fp
parseHal
::
FilePath
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseHal
fp
=
do
r
<-
readCsvHal
fp
pure
$
(
V
.
toList
.
V
.
map
csvHal2doc
.
snd
)
<$>
r
parseHal'
::
BL
.
ByteString
->
[
HyperdataDocument
]
parseHal'
=
V
.
toList
.
V
.
map
csvHal2doc
.
snd
.
readCsvHalLazyBS
parseHal'
::
BL
.
ByteString
->
Either
Prelude
.
String
[
HyperdataDocument
]
parseHal'
bs
=
(
V
.
toList
.
V
.
map
csvHal2doc
.
snd
)
<$>
readCsvHalLazyBS
bs
------------------------------------------------------------------------
parseCsv
::
FilePath
->
IO
[
HyperdataDocument
]
parseCsv
fp
=
V
.
toList
<$>
V
.
map
csv2doc
<$>
snd
<$>
readFile
fp
parseCsv
::
FilePath
->
IO
(
Either
Prelude
.
String
[
HyperdataDocument
])
parseCsv
fp
=
do
r
<-
readFile
fp
pure
$
(
V
.
toList
.
V
.
map
csv2doc
.
snd
)
<$>
r
parseCsv'
::
BL
.
ByteString
->
[
HyperdataDocument
]
parseCsv'
bs
=
V
.
toList
$
V
.
map
csv2doc
$
snd
$
readCsvLazyBS
bs
parseCsv'
::
BL
.
ByteString
->
Either
Prelude
.
String
[
HyperdataDocument
]
parseCsv'
bs
=
(
V
.
toList
.
V
.
map
csv2doc
.
snd
)
<$>
readCsvLazyBS
bs
------------------------------------------------------------------------
-- Csv v3 weighted for phylo
...
...
src/Gargantext/Database/Action/Flow.hs
View file @
d63df339
...
...
@@ -56,7 +56,7 @@ import Data.Map (Map, lookup)
import
Data.Maybe
(
catMaybes
)
import
Data.Monoid
import
Data.Swagger
import
Data.Text
(
splitOn
)
import
qualified
Data.Text
as
T
import
Data.Traversable
(
traverse
)
import
Data.Tuple.Extra
(
first
,
second
)
import
GHC.Generics
(
Generic
)
...
...
@@ -178,11 +178,12 @@ flowCorpusFile :: (FlowCmdM env err m)
->
TermType
Lang
->
FileFormat
->
FilePath
->
m
CorpusId
flowCorpusFile
u
n
l
la
ff
fp
=
do
docs
<-
liftBase
(
splitEvery
500
<$>
take
l
<$>
parseFile
ff
fp
)
eParsed
<-
liftBase
$
parseFile
ff
fp
case
eParsed
of
Right
parsed
->
do
let
docs
=
splitEvery
500
$
take
l
parsed
flowCorpus
u
n
la
(
map
(
map
toHyperdataDocument
)
docs
)
Left
e
->
panic
$
"Error: "
<>
(
T
.
pack
e
)
------------------------------------------------------------------------
-- | TODO improve the needed type to create/update a corpus
...
...
@@ -425,11 +426,11 @@ instance ExtractNgramsT HyperdataDocument
$
_hd_source
doc
institutes
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
map
toSchoolName
.
(
splitOn
", "
))
$
maybe
[
"Nothing"
]
(
map
toSchoolName
.
(
T
.
splitOn
", "
))
$
_hd_institutes
doc
authors
=
map
text2ngrams
$
maybe
[
"Nothing"
]
(
splitOn
", "
)
$
maybe
[
"Nothing"
]
(
T
.
splitOn
", "
)
$
_hd_authors
doc
terms'
<-
map
(
enrichedTerms
(
lang'
^.
tt_lang
)
CoreNLP
NP
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment