Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
153
Issues
153
List
Board
Labels
Milestones
Merge Requests
12
Merge Requests
12
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
5da469ed
Commit
5da469ed
authored
Jan 27, 2020
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[UPLOAD] 2 others file format
parent
e4cbfa19
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
51 additions
and
34 deletions
+51
-34
Main.hs
bin/gargantext-import/Main.hs
+2
-2
New.hs
src/Gargantext/API/Corpus/New.hs
+9
-8
Parsers.hs
src/Gargantext/Text/Corpus/Parsers.hs
+38
-22
CSV.hs
src/Gargantext/Text/Corpus/Parsers/CSV.hs
+2
-2
No files found.
bin/gargantext-import/Main.hs
View file @
5da469ed
...
...
@@ -51,12 +51,12 @@ main = do
let
--tt = (Unsupervised EN 6 0 Nothing)
tt
=
(
Multi
EN
)
format
=
CsvGargV3
-- CsvHal
Format
--WOS
format
=
CsvGargV3
-- CsvHal --WOS
corpus
::
forall
m
.
FlowCmdM
DevEnv
GargError
m
=>
m
CorpusId
corpus
=
flowCorpusFile
(
cs
user
)
(
Left
(
cs
name
::
Text
))
(
read
limit
::
Int
)
tt
format
corpusPath
corpusCsvHal
::
forall
m
.
FlowCmdM
DevEnv
GargError
m
=>
m
CorpusId
corpusCsvHal
=
flowCorpusFile
(
cs
user
)
(
Left
(
cs
name
::
Text
))
(
read
limit
::
Int
)
tt
CsvHal
Format
corpusPath
corpusCsvHal
=
flowCorpusFile
(
cs
user
)
(
Left
(
cs
name
::
Text
))
(
read
limit
::
Int
)
tt
CsvHal
corpusPath
annuaire
::
forall
m
.
FlowCmdM
DevEnv
GargError
m
=>
m
CorpusId
annuaire
=
flowAnnuaire
(
cs
user
)
(
Left
"Annuaire"
)
(
Multi
EN
)
corpusPath
...
...
src/Gargantext/API/Corpus/New.hs
View file @
5da469ed
...
...
@@ -43,7 +43,7 @@ import Gargantext.Database.Types.Node (CorpusId)
import
Gargantext.Database.Types.Node
(
ToHyperdataDocument
(
..
))
import
Gargantext.Database.Types.Node
(
UserId
)
import
Gargantext.Prelude
import
Gargantext.Text.Corpus.Parsers
.CSV
(
parseCsv'
,
parseHal'
)
import
Gargantext.Text.Corpus.Parsers
(
FileFormat
(
..
),
parseFormat
)
import
Gargantext.Text.Terms
(
TermType
(
..
))
import
Servant
import
Servant.API.Flatten
(
Flat
)
...
...
@@ -227,13 +227,14 @@ addToCorpusWithForm cid (WithForm ft d) logStatus = do
let
parse
=
case
ft
of
CSV_HAL
->
parseHal'
CSV
->
parseCsv'
_
->
parseHal'
docs
=
splitEvery
500
$
take
1000000
$
parse
(
cs
d
)
CSV_HAL
->
parseFormat
CsvHal
CSV
->
parseFormat
CsvGargV3
_
->
parseFormat
CsvHal
docs
<-
liftIO
$
splitEvery
500
<$>
take
1000000
<$>
parse
(
cs
d
)
logStatus
ScraperStatus
{
_scst_succeeded
=
Just
1
,
_scst_failed
=
Just
0
...
...
src/Gargantext/Text/Corpus/Parsers.hs
View file @
5da469ed
...
...
@@ -22,38 +22,38 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Corpus.Parsers
(
FileFormat
(
..
),
clean
,
parseFile
,
cleanText
)
module
Gargantext.Text.Corpus.Parsers
(
FileFormat
(
..
),
clean
,
parseFile
,
cleanText
,
parseFormat
)
where
--import Data.ByteString (ByteString)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Monad
(
join
)
import
qualified
Data.ByteString.Char8
as
DBC
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.Either
(
Either
(
..
))
import
Data.Either.Extra
(
partitionEithers
)
import
Data.List
(
concat
)
import
Data.List
(
lookup
)
import
Data.List
(
concat
,
lookup
)
import
Data.Ord
()
import
Data.String
(
String
())
import
Data.String
()
import
Data.Text
(
Text
)
import
Data.Text.Encoding
(
decodeUtf8
)
import
Data.Tuple.Extra
(
both
,
first
,
second
)
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
qualified
Data.ByteString
as
DB
import
qualified
Data.Map
as
DM
import
qualified
Data.Text
as
DT
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Prelude
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
qualified
Gargantext.Text.Corpus.Parsers.WOS
as
WOS
import
qualified
Gargantext.Text.Corpus.Parsers.RIS
as
RIS
import
Gargantext.Prelude
import
Gargantext.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
)
import
Gargantext.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
import
qualified
Gargantext.Text.Corpus.Parsers.Date
as
Date
import
Gargantext.Text.Corpus.Parsers.CSV
(
parseHal
,
parseCsv
)
import
Gargantext.Text.Learn
(
detectLangDefault
)
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
qualified
Data.ByteString
as
DB
import
qualified
Data.ByteString.Lazy
as
DBL
import
qualified
Data.ByteString.Char8
as
DBC
import
qualified
Data.Map
as
DM
import
qualified
Data.Text
as
DT
import
qualified
Gargantext.Text.Corpus.Parsers.Date
as
Date
import
qualified
Gargantext.Text.Corpus.Parsers.RIS
as
RIS
import
qualified
Gargantext.Text.Corpus.Parsers.WOS
as
WOS
------------------------------------------------------------------------
type
ParseError
=
String
...
...
@@ -68,7 +68,7 @@ type ParseError = String
-- | According to the format of Input file,
-- different parser are available.
data
FileFormat
=
WOS
|
RIS
|
RisPresse
|
CsvGargV3
|
CsvHal
Format
|
CsvGargV3
|
CsvHal
deriving
(
Show
)
-- Implemented (ISI Format)
...
...
@@ -78,20 +78,30 @@ data FileFormat = WOS | RIS | RisPresse
-- | XML -- Not Implemented / see :
{-
parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
parseFormat = undefined
-}
parseFormat
::
FileFormat
->
DB
.
ByteString
->
IO
[
HyperdataDocument
]
parseFormat
CsvGargV3
bs
=
pure
$
parseCsv'
$
DBL
.
fromStrict
bs
parseFormat
CsvHal
bs
=
pure
$
parseHal'
$
DBL
.
fromStrict
bs
parseFormat
RisPresse
bs
=
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
RisPresse
$
partitionEithers
$
[
runParser'
RisPresse
bs
]
parseFormat
WOS
bs
=
mapM
(
toDoc
WOS
)
<$>
snd
<$>
enrichWith
WOS
$
partitionEithers
$
[
runParser'
WOS
bs
]
parseFormat
_
_
=
undefined
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseFile
CsvHal
Format
p
=
parseHal
p
parseFile
CsvHal
p
=
parseHal
p
parseFile
CsvGargV3
p
=
parseCsv
p
parseFile
RisPresse
p
=
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
RisPresse
<$>
readFileWith
RIS
p
parseFile
WOS
p
=
join
$
mapM
(
toDoc
WOS
)
<$>
snd
<$>
enrichWith
WOS
<$>
readFileWith
WOS
p
parseFile
ff
p
=
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
enrichWith
ff
<$>
readFileWith
ff
p
parseFile
ff
p
=
join
$
mapM
(
toDoc
ff
)
<$>
snd
<$>
enrichWith
ff
<$>
readFileWith
ff
p
toDoc
::
FileFormat
->
[(
Text
,
Text
)]
->
IO
HyperdataDocument
-- TODO use language for RIS
...
...
@@ -130,12 +140,14 @@ enrichWith WOS = enrichWith' (map (first WOS.keys))
enrichWith
_
=
enrichWith'
identity
enrichWith'
::
([(
DB
.
ByteString
,
DB
.
ByteString
)]
->
[(
DB
.
ByteString
,
DB
.
ByteString
)])
enrichWith'
::
([(
DB
.
ByteString
,
DB
.
ByteString
)]
->
[(
DB
.
ByteString
,
DB
.
ByteString
)])
->
(
a
,
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
->
(
a
,
[[(
Text
,
Text
)]])
enrichWith'
f
=
second
(
map
both'
.
map
f
.
concat
)
where
both'
=
map
(
both
decodeUtf8
)
readFileWith
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
readFileWith
format
path
=
do
...
...
@@ -157,7 +169,11 @@ withParser _ = panic "[ERROR] Parser not implemented yet"
runParser
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
String
[[(
DB
.
ByteString
,
DB
.
ByteString
)]])
runParser
format
text
=
pure
$
parseOnly
(
withParser
format
)
text
runParser
format
text
=
pure
$
runParser'
format
text
runParser'
::
FileFormat
->
DB
.
ByteString
->
(
Either
String
[[(
DB
.
ByteString
,
DB
.
ByteString
)]])
runParser'
format
text
=
parseOnly
(
withParser
format
)
text
openZip
::
FilePath
->
IO
[
DB
.
ByteString
]
openZip
fp
=
do
...
...
src/Gargantext/Text/Corpus/Parsers/CSV.hs
View file @
5da469ed
...
...
@@ -32,8 +32,8 @@ import Gargantext.Prelude hiding (length)
import
Gargantext.Text
import
Gargantext.Text.Context
import
qualified
Data.ByteString.Lazy
as
BL
import
qualified
Data.ByteString
as
BS
import
qualified
Data.Vector
as
V
import
qualified
Data.ByteString
as
BS
import
qualified
Data.Vector
as
V
---------------------------------------------------------------
headerCsvGargV3
::
Header
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment