Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
haskell-gargantext
Commits
401c86e5
Commit
401c86e5
authored
Nov 16, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TEXT][Parser] Hal CSV parser.
parent
95cd84b4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
50 additions
and
19 deletions
+50
-19
Flow.hs
src/Gargantext/Database/Flow.hs
+4
-4
Parsers.hs
src/Gargantext/Text/Parsers.hs
+9
-11
CSV.hs
src/Gargantext/Text/Parsers/CSV.hs
+37
-4
No files found.
src/Gargantext/Database/Flow.hs
View file @
401c86e5
...
...
@@ -42,20 +42,20 @@ import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNod
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Database.User
(
getUser
,
UserLight
(
..
),
Username
)
import
Gargantext.Prelude
import
Gargantext.Text.Parsers
(
parseDocs
,
FileFormat
(
WOS
)
)
import
Gargantext.Text.Parsers
(
parseDocs
,
FileFormat
)
type
UserId
=
Int
type
RootId
=
Int
type
CorpusId
=
Int
flowDatabase
::
FilePath
->
CorpusName
->
IO
[
Int
]
flowDatabase
fp
cName
=
do
flowDatabase
::
File
Format
->
File
Path
->
CorpusName
->
IO
[
Int
]
flowDatabase
f
f
f
p
cName
=
do
-- Corpus Flow
(
masterUserId
,
_
,
corpusId
)
<-
subFlow
"gargantua"
"Big Corpus"
-- Documents Flow
hyperdataDocuments
<-
map
addUniqIds
<$>
parseDocs
WOS
fp
hyperdataDocuments
<-
map
addUniqIds
<$>
parseDocs
ff
fp
ids
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
hyperdataDocuments
printDebug
"Docs IDs : "
(
length
ids
)
idsRepeat
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
hyperdataDocuments
...
...
src/Gargantext/Text/Parsers.hs
View file @
401c86e5
...
...
@@ -56,6 +56,7 @@ import Gargantext.Prelude
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Text.Parsers.WOS
(
wosParser
)
import
Gargantext.Text.Parsers.Date
(
parseDate
)
import
Gargantext.Text.Parsers.CSV
(
parseHal
)
import
Gargantext.Text.Terms.Stop
(
detectLang
)
------------------------------------------------------------------------
...
...
@@ -70,7 +71,7 @@ type ParseError = String
-- | Depending on the format of the input file,
-- different parsers are available.
data
FileFormat
=
WOS
data
FileFormat
=
WOS
|
CsvHalFormat
-- | CsvGargV3
deriving
(
Show
)
-- Implemented (ISI Format)
...
...
@@ -86,11 +87,10 @@ data FileFormat = WOS
-- | Parse file into documents
-- TODO manage errors here
parseDocs
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseDocs
format
path
=
do
docs
<-
snd
<$>
parse
format
path
mapM
(
toDoc
format
)
docs
parseDocs
WOS
path
=
join
$
mapM
(
toDoc
WOS
)
<$>
snd
<$>
parse
WOS
path
parseDocs
CsvHalFormat
p
=
parseHal
p
type
Year
=
Int
type
Year
=
Int
type
Month
=
Int
type
Day
=
Int
...
...
@@ -102,11 +102,11 @@ parseDate' l (Just txt) = do
utcTime
<-
parseDate
l
txt
let
(
UTCTime
day
_
)
=
utcTime
let
(
y
,
m
,
d
)
=
DT
.
toGregorian
day
pure
(
Just
utcTime
,
(
Just
(
fromIntegral
y
),
Just
m
,
Just
d
))
pure
(
Just
utcTime
,
(
Just
(
fromIntegral
y
),
Just
m
,
Just
d
))
toDoc
::
FileFormat
->
[(
Text
,
Text
)]
->
IO
HyperdataDocument
toDoc
format
d
=
do
toDoc
WOS
d
=
do
let
abstract
=
lookup
"abstract"
d
let
lang
=
maybe
EN
identity
(
join
$
detectLang
<$>
(
fmap
(
DT
.
take
50
)
abstract
))
...
...
@@ -115,7 +115,7 @@ toDoc format d = do
(
utcTime
,
(
pub_year
,
pub_month
,
pub_day
))
<-
parseDate'
lang
dateToParse
pure
$
HyperdataDocument
(
Just
$
DT
.
pack
$
show
format
)
pure
$
HyperdataDocument
(
Just
$
DT
.
pack
$
show
WOS
)
(
lookup
"doi"
d
)
(
lookup
"URL"
d
)
Nothing
...
...
@@ -134,7 +134,6 @@ toDoc format d = do
Nothing
(
Just
$
(
DT
.
pack
.
show
)
lang
)
parse
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[(
Text
,
Text
)]])
parse
format
path
=
do
files
<-
case
takeExtension
path
of
...
...
@@ -157,7 +156,7 @@ withParser WOS = wosParser
--withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet"
runParser
::
FileFormat
->
DB
.
ByteString
runParser
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
String
[[(
DB
.
ByteString
,
DB
.
ByteString
)]])
runParser
format
text
=
pure
$
parseOnly
(
withParser
format
)
text
...
...
@@ -173,4 +172,3 @@ clean txt = DT.map clean' txt
clean'
'’'
=
'
\'
'
clean'
c
=
c
src/Gargantext/Text/Parsers/CSV.hs
View file @
401c86e5
...
...
@@ -25,8 +25,9 @@ import Control.Applicative
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
,
unpack
)
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Time.Segment
(
jour
)
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
V
...
...
@@ -194,8 +195,6 @@ readHal fp = do
Left
e
->
panic
(
pack
e
)
Right
csvDocs
->
pure
csvDocs
------------------------------------------------------------------------
-- | Encode the given documents as CSV (by column name, using
-- 'csvEncodeOptions' and the supplied header) and write the resulting
-- lazy ByteString to the file at the given path.
writeCsv :: FilePath -> (Header, Vector CsvDoc) -> IO ()
writeCsv path (header, docs) =
  BL.writeFile path (encodeByNameWith csvEncodeOptions header (V.toList docs))
...
...
@@ -206,7 +205,7 @@ writeCsv fp (h, vs) = BL.writeFile fp $
data
CsvHal
=
CsvHal
{
csvHal_title
::
!
Text
,
csvHal_source
::
!
Text
,
csvHal_publication_year
::
!
Int
,
csvHal_publication_year
::
!
Int
eger
,
csvHal_publication_month
::
!
Int
,
csvHal_publication_day
::
!
Int
,
csvHal_abstract
::
!
Text
...
...
@@ -257,9 +256,11 @@ instance ToNamedRecord CsvHal where
toNamedRecord
(
CsvHal
t
s
py
pm
pd
abst
aut
url
isbn
iss
jour
lang
doi
auth
inst
dept
lab
team
doct
)
=
namedRecord
[
"title"
.=
t
,
"source"
.=
s
,
"publication_year"
.=
py
,
"publication_month"
.=
pm
,
"publication_day"
.=
pd
,
"abstract"
.=
abst
,
"authors"
.=
aut
...
...
@@ -278,3 +279,35 @@ instance ToNamedRecord CsvHal where
,
"rteamStructId_i"
.=
team
,
"docType_s"
.=
doct
]
-- | Convert one HAL CSV record into a 'HyperdataDocument'.
--
-- Only the title, source, publication date, abstract, authors, url and doi
-- columns of the record are used; every other column is ignored.
-- The arguments of 'HyperdataDocument' are positional: their order must
-- follow the constructor's declaration (not visible from this module).
csvHal2doc :: CsvHal -> HyperdataDocument
csvHal2doc (CsvHal halTitle halSource
                   year month day
                   halAbstract halAuthors
                   halUrl _ _ _ _ halDoi _ _ _ _ _ _) =
  HyperdataDocument
    (Just "CsvHal")             -- constant tag identifying the HAL CSV origin
    (Just halDoi)
    (Just halUrl)
    Nothing
    Nothing
    Nothing
    (Just halTitle)
    (Just halAuthors)
    (Just halSource)
    (Just halAbstract)
    (Just dateText)
    (Just (fromIntegral year))  -- the record stores the year as an Integer
    (Just month)
    (Just day)
    Nothing
    Nothing
    Nothing
    Nothing
  where
    -- Textual rendering (via 'show') of the date built by 'jour'.
    dateText = pack (show (jour year month day))
------------------------------------------------------------------------
-- | Read the HAL CSV file at the given path with 'readHal' and convert
-- every decoded record to a 'HyperdataDocument' using 'csvHal2doc'.
-- The first component of the 'readHal' result is discarded.
parseHal :: FilePath -> IO [HyperdataDocument]
parseHal fp = toDocs <$> readHal fp
  where
    toDocs = map csvHal2doc . V.toList . snd
------------------------------------------------------------------------
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment