Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
11
Merge Requests
11
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
401c86e5
Commit
401c86e5
authored
Nov 16, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[TEXT][Parser] Hal CSV parser.
parent
95cd84b4
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
50 additions
and
19 deletions
+50
-19
Flow.hs
src/Gargantext/Database/Flow.hs
+4
-4
Parsers.hs
src/Gargantext/Text/Parsers.hs
+9
-11
CSV.hs
src/Gargantext/Text/Parsers/CSV.hs
+37
-4
No files found.
src/Gargantext/Database/Flow.hs
View file @
401c86e5
...
...
@@ -42,20 +42,20 @@ import Gargantext.Database.NodeNgramsNgrams (NodeNgramsNgramsPoly(..), insertNod
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Database.User
(
getUser
,
UserLight
(
..
),
Username
)
import
Gargantext.Prelude
import
Gargantext.Text.Parsers
(
parseDocs
,
FileFormat
(
WOS
)
)
import
Gargantext.Text.Parsers
(
parseDocs
,
FileFormat
)
type
UserId
=
Int
type
RootId
=
Int
type
CorpusId
=
Int
flowDatabase
::
FilePath
->
CorpusName
->
IO
[
Int
]
flowDatabase
fp
cName
=
do
flowDatabase
::
FileFormat
->
FilePath
->
CorpusName
->
IO
[
Int
]
flowDatabase
ff
fp
cName
=
do
-- Corpus Flow
(
masterUserId
,
_
,
corpusId
)
<-
subFlow
"gargantua"
"Big Corpus"
-- Documents Flow
hyperdataDocuments
<-
map
addUniqIds
<$>
parseDocs
WOS
fp
hyperdataDocuments
<-
map
addUniqIds
<$>
parseDocs
ff
fp
ids
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
hyperdataDocuments
printDebug
"Docs IDs : "
(
length
ids
)
idsRepeat
<-
runCmd'
$
insertDocuments
masterUserId
corpusId
hyperdataDocuments
...
...
src/Gargantext/Text/Parsers.hs
View file @
401c86e5
...
...
@@ -56,6 +56,7 @@ import Gargantext.Prelude
import
Gargantext.Database.Types.Node
(
HyperdataDocument
(
..
))
import
Gargantext.Text.Parsers.WOS
(
wosParser
)
import
Gargantext.Text.Parsers.Date
(
parseDate
)
import
Gargantext.Text.Parsers.CSV
(
parseHal
)
import
Gargantext.Text.Terms.Stop
(
detectLang
)
------------------------------------------------------------------------
...
...
@@ -70,7 +71,7 @@ type ParseError = String
-- | According to the format of the input file,
-- different parsers are available.
data
FileFormat
=
WOS
data
FileFormat
=
WOS
|
CsvHalFormat
-- | CsvGargV3
deriving
(
Show
)
-- Implemented (ISI Format)
...
...
@@ -86,11 +87,10 @@ data FileFormat = WOS
-- | Parse file into documents
-- TODO manage errors here
parseDocs
::
FileFormat
->
FilePath
->
IO
[
HyperdataDocument
]
parseDocs
format
path
=
do
docs
<-
snd
<$>
parse
format
path
mapM
(
toDoc
format
)
docs
parseDocs
WOS
path
=
join
$
mapM
(
toDoc
WOS
)
<$>
snd
<$>
parse
WOS
path
parseDocs
CsvHalFormat
p
=
parseHal
p
type
Year
=
Int
type
Year
=
Int
type
Month
=
Int
type
Day
=
Int
...
...
@@ -102,11 +102,11 @@ parseDate' l (Just txt) = do
utcTime
<-
parseDate
l
txt
let
(
UTCTime
day
_
)
=
utcTime
let
(
y
,
m
,
d
)
=
DT
.
toGregorian
day
pure
(
Just
utcTime
,
(
Just
(
fromIntegral
y
),
Just
m
,
Just
d
))
pure
(
Just
utcTime
,
(
Just
(
fromIntegral
y
),
Just
m
,
Just
d
))
toDoc
::
FileFormat
->
[(
Text
,
Text
)]
->
IO
HyperdataDocument
toDoc
format
d
=
do
toDoc
WOS
d
=
do
let
abstract
=
lookup
"abstract"
d
let
lang
=
maybe
EN
identity
(
join
$
detectLang
<$>
(
fmap
(
DT
.
take
50
)
abstract
))
...
...
@@ -115,7 +115,7 @@ toDoc format d = do
(
utcTime
,
(
pub_year
,
pub_month
,
pub_day
))
<-
parseDate'
lang
dateToParse
pure
$
HyperdataDocument
(
Just
$
DT
.
pack
$
show
format
)
pure
$
HyperdataDocument
(
Just
$
DT
.
pack
$
show
WOS
)
(
lookup
"doi"
d
)
(
lookup
"URL"
d
)
Nothing
...
...
@@ -134,7 +134,6 @@ toDoc format d = do
Nothing
(
Just
$
(
DT
.
pack
.
show
)
lang
)
parse
::
FileFormat
->
FilePath
->
IO
([
ParseError
],
[[(
Text
,
Text
)]])
parse
format
path
=
do
files
<-
case
takeExtension
path
of
...
...
@@ -157,7 +156,7 @@ withParser WOS = wosParser
--withParser XML = xmlParser
--withParser _ = error "[ERROR] Parser not implemented yet"
runParser
::
FileFormat
->
DB
.
ByteString
runParser
::
FileFormat
->
DB
.
ByteString
->
IO
(
Either
String
[[(
DB
.
ByteString
,
DB
.
ByteString
)]])
runParser
format
text
=
pure
$
parseOnly
(
withParser
format
)
text
...
...
@@ -173,4 +172,3 @@ clean txt = DT.map clean' txt
clean'
'’'
=
'\''
clean'
c
=
c
src/Gargantext/Text/Parsers/CSV.hs
View file @
401c86e5
...
...
@@ -25,8 +25,9 @@ import Control.Applicative
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
,
unpack
)
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Time.Segment
(
jour
)
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
V
...
...
@@ -194,8 +195,6 @@ readHal fp = do
Left
e
->
panic
(
pack
e
)
Right
csvDocs
->
pure
csvDocs
------------------------------------------------------------------------
writeCsv
::
FilePath
->
(
Header
,
Vector
CsvDoc
)
->
IO
()
writeCsv
fp
(
h
,
vs
)
=
BL
.
writeFile
fp
$
encodeByNameWith
csvEncodeOptions
h
(
V
.
toList
vs
)
...
...
@@ -206,7 +205,7 @@ writeCsv fp (h, vs) = BL.writeFile fp $
data
CsvHal
=
CsvHal
{
csvHal_title
::
!
Text
,
csvHal_source
::
!
Text
,
csvHal_publication_year
::
!
Int
,
csvHal_publication_year
::
!
Integer
,
csvHal_publication_month
::
!
Int
,
csvHal_publication_day
::
!
Int
,
csvHal_abstract
::
!
Text
...
...
@@ -257,9 +256,11 @@ instance ToNamedRecord CsvHal where
toNamedRecord
(
CsvHal
t
s
py
pm
pd
abst
aut
url
isbn
iss
jour
lang
doi
auth
inst
dept
lab
team
doct
)
=
namedRecord
[
"title"
.=
t
,
"source"
.=
s
,
"publication_year"
.=
py
,
"publication_month"
.=
pm
,
"publication_day"
.=
pd
,
"abstract"
.=
abst
,
"authors"
.=
aut
...
...
@@ -278,3 +279,35 @@ instance ToNamedRecord CsvHal where
,
"rteamStructId_i"
.=
team
,
"docType_s"
.=
doct
]
csvHal2doc
::
CsvHal
->
HyperdataDocument
csvHal2doc
(
CsvHal
title
source
pub_year
pub_month
pub_day
abstract
authors
url
_
_
_
_
doi
_
_
_
_
_
_
)
=
HyperdataDocument
(
Just
"CsvHal"
)
(
Just
doi
)
(
Just
url
)
Nothing
Nothing
Nothing
(
Just
title
)
(
Just
authors
)
(
Just
source
)
(
Just
abstract
)
(
Just
$
pack
.
show
$
jour
pub_year
pub_month
pub_day
)
(
Just
$
fromIntegral
pub_year
)
(
Just
pub_month
)
(
Just
pub_day
)
Nothing
Nothing
Nothing
Nothing
------------------------------------------------------------------------
parseHal
::
FilePath
->
IO
[
HyperdataDocument
]
parseHal
fp
=
map
csvHal2doc
<$>
V
.
toList
<$>
snd
<$>
readHal
fp
------------------------------------------------------------------------
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment