Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
5da469ed
Commit
5da469ed
authored
Jan 27, 2020
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[UPLOAD] 2 others file format
parent
e4cbfa19
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
49 additions
and
32 deletions
+49
-32
Main.hs
bin/gargantext-import/Main.hs
+2
-2
New.hs
src/Gargantext/API/Corpus/New.hs
+9
-8
Parsers.hs
src/Gargantext/Text/Corpus/Parsers.hs
+38
-22
No files found.
bin/gargantext-import/Main.hs
View file @
5da469ed
...
...
@@ -51,12 +51,12 @@ main = do
let
--tt = (Unsupervised EN 6 0 Nothing)
tt
=
(
Multi
EN
)
format
=
CsvGargV3
-- CsvHal
Format
--WOS
format
=
CsvGargV3
-- CsvHal --WOS
corpus
::
forall
m
.
FlowCmdM
DevEnv
GargError
m
=>
m
CorpusId
corpus
=
flowCorpusFile
(
cs
user
)
(
Left
(
cs
name
::
Text
))
(
read
limit
::
Int
)
tt
format
corpusPath
corpusCsvHal
::
forall
m
.
FlowCmdM
DevEnv
GargError
m
=>
m
CorpusId
corpusCsvHal
=
flowCorpusFile
(
cs
user
)
(
Left
(
cs
name
::
Text
))
(
read
limit
::
Int
)
tt
CsvHal
Format
corpusPath
corpusCsvHal
=
flowCorpusFile
(
cs
user
)
(
Left
(
cs
name
::
Text
))
(
read
limit
::
Int
)
tt
CsvHal
corpusPath
annuaire
::
forall
m
.
FlowCmdM
DevEnv
GargError
m
=>
m
CorpusId
annuaire
=
flowAnnuaire
(
cs
user
)
(
Left
"Annuaire"
)
(
Multi
EN
)
corpusPath
...
...
src/Gargantext/API/Corpus/New.hs
View file @
5da469ed
...
...
@@ -43,7 +43,7 @@ import Gargantext.Database.Types.Node (CorpusId)
import
Gargantext.Database.Types.Node
(
ToHyperdataDocument
(
..
))
import
Gargantext.Database.Types.Node
(
UserId
)
import
Gargantext.Prelude
import
Gargantext.Text.Corpus.Parsers
.CSV
(
parseCsv'
,
parseHal'
)
import
Gargantext.Text.Corpus.Parsers
(
FileFormat
(
..
),
parseFormat
)
import
Gargantext.Text.Terms
(
TermType
(
..
))
import
Servant
import
Servant.API.Flatten
(
Flat
)
...
...
@@ -227,13 +227,14 @@ addToCorpusWithForm cid (WithForm ft d) logStatus = do
let
parse
=
case
ft
of
CSV_HAL
->
parseHal'
CSV
->
parseCsv'
_
->
parseHal'
docs
=
splitEvery
500
$
take
1000000
$
parse
(
cs
d
)
CSV_HAL
->
parseFormat
CsvHal
CSV
->
parseFormat
CsvGargV3
_
->
parseFormat
CsvHal
docs
<-
liftIO
$
splitEvery
500
<$>
take
1000000
<$>
parse
(
cs
d
)
logStatus
ScraperStatus
{
_scst_succeeded
=
Just
1
,
_scst_failed
=
Just
0
...
...
src/Gargantext/Text/Corpus/Parsers.hs
View file @
5da469ed
...
...
@@ -22,38 +22,38 @@ please follow the types.
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.Corpus.Parsers
(
FileFormat
(
..
),
clean
,
parseFile
,
cleanText
)
module
Gargantext.Text.Corpus.Parsers
(
FileFormat
(
..
),
clean
,
parseFile
,
cleanText
,
parseFormat
)
where
--import Data.ByteString (ByteString)
import
"zip"
Codec.Archive.Zip
(
withArchive
,
getEntry
,
getEntries
)
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Control.Monad
(
join
)
import
qualified
Data.ByteString.Char8
as
DBC
import
Data.Attoparsec.ByteString
(
parseOnly
,
Parser
)
import
Data.Either
(
Either
(
..
))
import
Data.Either.Extra
(
partitionEithers
)
import
Data.List
(
concat
)
import
Data.List
(
lookup
)
import
Data.List
(
concat
,
lookup
)
import
Data.Ord
()
import
Data.String
(
String
())
import
Data.String
()
import
Data.Text
(
Text
)
import
Data.Text.Encoding
(
decodeUtf8
)
import
Data.Tuple.Extra
(
both
,
first
,
second
)
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Prelude
import
Gargantext.Text.Corpus.Parsers.CSV
(
parseHal
,
parseHal'
,
parseCsv
,
parseCsv'
)
import
Gargantext.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
import
Gargantext.Text.Learn
(
detectLangDefault
)
import
System.FilePath
(
FilePath
(),
takeExtension
)
import
qualified
Data.ByteString
as
DB
import
qualified
Data.ByteString.Lazy
as
DBL
import
qualified
Data.ByteString.Char8
as
DBC
import
qualified
Data.Map
as
DM
import
qualified
Data.Text
as
DT
import
Gargantext.Core
(
Lang
(
..
))
import
Gargantext.Prelude
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
qualified
Gargantext.Text.Corpus.Parsers.WOS
as
WOS
import
qualified
Gargantext.Text.Corpus.Parsers.RIS
as
RIS
import
Gargantext.Text.Corpus.Parsers.RIS.Presse
(
presseEnrich
)
import
qualified
Gargantext.Text.Corpus.Parsers.Date
as
Date
import
Gargantext.Text.Corpus.Parsers.CSV
(
parseHal
,
parseCsv
)
import
Gargantext.Text.Learn
(
detectLangDefault
)
import
qualified
Gargantext.Text.Corpus.Parsers.RIS
as
RIS
import
qualified
Gargantext.Text.Corpus.Parsers.WOS
as
WOS
------------------------------------------------------------------------
type
ParseError
=
String
...
...
@@ -68,7 +68,7 @@ type ParseError = String
-- | Depending on the format of the input file,
-- different parsers are available.
-- | Input formats a corpus can be parsed from.
-- Each constructor appears exactly once; the HAL CSV format is named
-- 'CsvHal', which is the name the 'parseFormat' and 'parseFile' equations
-- match on.
data FileFormat = WOS | RIS | RisPresse
                | CsvGargV3 | CsvHal
  deriving (Show)
-- Implemented (ISI Format)
...
...
@@ -78,16 +78,26 @@ data FileFormat = WOS | RIS | RisPresse
-- | XML -- Not Implemented / see :
{-
parseFormat :: FileFormat -> ByteString -> [HyperdataDocument]
parseFormat = undefined
-}
-- | Parse an in-memory payload into documents according to its 'FileFormat'.
--
-- * 'CsvGargV3' and 'CsvHal' are decoded purely by the CSV parsers, after
--   converting the strict input to a lazy ByteString.
-- * 'RisPresse' and 'WOS' are parsed with 'runParser'', passed through
--   'enrichWith', and converted with 'toDoc'. Only the second component of
--   the enriched pair is kept, so the 'Left' results collected by
--   'partitionEithers' are not reported to the caller.
-- * Any other format aborts with 'panic'.
parseFormat :: FileFormat -> DB.ByteString -> IO [HyperdataDocument]
parseFormat CsvGargV3 bs = pure $ parseCsv' $ DBL.fromStrict bs
parseFormat CsvHal    bs = pure $ parseHal' $ DBL.fromStrict bs
-- RisPresse input is parsed with the RIS parser and then enriched as
-- RisPresse, mirroring 'parseFile' (which uses @readFileWith RIS@).
-- NOTE(review): 'withParser' has a catch-all 'panic'; presumably it has no
-- dedicated RisPresse case -- confirm against its definition.
parseFormat RisPresse bs =
  mapM (toDoc RIS)
    $ snd
    $ enrichWith RisPresse
    $ partitionEithers [runParser' RIS bs]
parseFormat WOS bs =
  mapM (toDoc WOS)
    $ snd
    $ enrichWith WOS
    $ partitionEithers [runParser' WOS bs]
-- Fail with an explicit message rather than a bare 'undefined'.
parseFormat _ _ = panic "[ERROR] parseFormat: format not implemented yet"
-- | Parse file into documents
-- TODO manage errors here
-- TODO: to debug maybe add the filepath in error message
parseFile
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseFile
CsvHal
Format
p
=
parseHal
p
parseFile
CsvHal
p
=
parseHal
p
parseFile
CsvGargV3
p
=
parseCsv
p
parseFile
RisPresse
p
=
join
$
mapM
(
toDoc
RIS
)
<$>
snd
<$>
enrichWith
RisPresse
<$>
readFileWith
RIS
p
parseFile
WOS
p
=
join
$
mapM
(
toDoc
WOS
)
<$>
snd
<$>
enrichWith
WOS
<$>
readFileWith
WOS
p
...
...
@@ -136,6 +146,8 @@ enrichWith' f = second (map both' . map f . concat)
where
both'
=
map
(
both
decodeUtf8
)
readFileWith
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[[(
DB
.
ByteString
,
DB
.
ByteString
)]]])
readFileWith
format
path
=
do
...
...
@@ -157,7 +169,11 @@ withParser _ = panic "[ERROR] Parser not implemented yet"
-- | Run the parser selected by 'withParser' for the given format on a strict
-- ByteString and return the result in 'IO'.
--
-- This is a thin wrapper that lifts the pure 'runParser'' with 'pure'; it
-- performs no I/O of its own. A single equation is kept so that no clause is
-- shadowed by an earlier, overlapping one.
runParser :: FileFormat -> DB.ByteString
          -> IO (Either String [[(DB.ByteString, DB.ByteString)]])
runParser format text = pure $ runParser' format text
-- | Pure parsing step: apply the attoparsec parser chosen by 'withParser'
-- for @format@ to the input with 'parseOnly'.
--
-- Returns 'Left' with the parser's error message on failure, or 'Right'
-- with the parsed list of key/value pair lists on success.
runParser' :: FileFormat
           -> DB.ByteString
           -> Either String [[(DB.ByteString, DB.ByteString)]]
runParser' format = parseOnly (withParser format)
openZip
::
FilePath
->
IO
[
DB
.
ByteString
]
openZip
fp
=
do
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment