Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
157
Issues
157
List
Board
Labels
Milestones
Merge Requests
9
Merge Requests
9
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
2cd0e36a
Commit
2cd0e36a
authored
Dec 12, 2017
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[DRAFT] Parser main functions, for meeting.
parent
5c1b33ff
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
123 additions
and
60 deletions
+123
-60
Parsers.hs
src/Data/Gargantext/Parsers.hs
+97
-4
WOS.hs
src/Data/Gargantext/Parsers/WOS.hs
+16
-55
Main.hs
src/Data/Gargantext/Types/Main.hs
+10
-1
No files found.
src/Data/Gargantext/Parsers.hs
View file @
2cd0e36a
{-|
Module : Data.Gargantext.Parsers
Description : All parsers of Gargantext in one file.
Copyright : (c) CNRS, 2017
License : AGPL + CECILL v3
Maintainer : alexandre.delanoe@iscpif.fr
Stability : experimental
Portability : POSIX
Gargantext enables analyzing semi-structured text that should be parsed
in order to be analyzed.
The parsers assume that the format of the text is known (see the
FileFormat data type); the right parser is chosen from the list of
available parsers according to that format.
This module mainly describes how to add a new parser to Gargantext;
please follow the types.
-}
module
Data.Gargantext.Parsers
(
module
Data
.
Gargantext
.
Parsers
.
WOS
,
module
Data
.
Gargantext
.
Parsers
.
Date
--, module Data.Gargantext.Parsers.XML
--, module Data.Gargantext.Parsers.DOC
--, module Data.Gargantext.Parsers.ODS
)
where
import
Data.Gargantext.Parsers.WOS
import
Data.Gargantext.Parsers.Date
import
Data.Attoparsec.ByteString
import
Data.ByteString
(
ByteString
)
import
Data.Map
as
DM
import
Data.Either.Extra
(
Either
(
..
))
import
Control.Monad
(
join
)
import
Codec.Archive.Zip
import
Path.IO
(
resolveFile'
)
-- import qualified Data.ByteString.Lazy as B
import
Control.Applicative
(
(
<$>
)
)
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Gargantext.Parsers.WOS
(
wosParser
)
-- import Data.Gargantext.Parsers.XML (xmlParser)
-- import Data.Gargantext.Parsers.DOC (docParser)
-- import Data.Gargantext.Parsers.ODS (odsParser)
import
Data.Gargantext.Prelude
import
Data.Gargantext.Types.Main
(
ErrorMessage
(),
GargParser
(),
Corpus
)
-- | Supported input file formats. The format of the input file decides
-- which parser is used (see 'withParser').
data FileFormat
  = WOS -- ^ Implemented (ISI format).
  | XML -- ^ Not implemented; see
        --   <http://chrisdone.com/posts/fast-haskell-c-parsing-xml>.
  | DOC -- ^ Not implemented (import Pandoc).
  | ODS -- ^ Not implemented (import Pandoc).
  | PDF -- ^ Not implemented (pdftotext and import Pandoc?).
-- | Choose the parser that matches the format of the text.
--
-- Only 'WOS' is wired up for now; every other format falls through to
-- the catch-all branch and aborts with 'error'.
withParser :: FileFormat -> GargParser
withParser format = case format of
  WOS -> wosParser
  -- XML -> xmlParser
  -- DOC -> docParser
  -- ODS -> odsParser
  _   -> error "[ERROR] Parser not implemented yet"
-- | Run the parser selected for @format@ (via 'withParser') on the raw
-- input @text@ using 'parseOnly'.
--
-- NOTE(review): 'withParser' is declared to return a 'GargParser', yet
-- its result is handed to 'parseOnly' here -- confirm the intended
-- types before relying on this (draft code).
runParser
  :: FileFormat
  -> ByteString
  -> Either ErrorMessage (IO (Maybe Corpus))
runParser format text = parseOnly parser text
  where
    parser = withParser format
-- | Parse a zipped input into a 'Corpus'.
--
-- Placeholder: not implemented yet, so evaluating it hits 'undefined'.
parseZip :: FilePath -> ByteString -> IO Corpus
parseZip = undefined
-- | Parse one file's content with the parser selected for @p@.
--
-- Yields @0@ when 'runParser' fails and the 'length' of the parse
-- result otherwise.
--
-- NOTE(review): the body produces a count while the signature promises
-- a 'Corpus', and 'wosParserTest' expects @IO [Int]@ from it -- confirm
-- which type is intended (draft code).
parseFile :: FileFormat -> ByteString -> IO Corpus
parseFile p x =
  case runParser p x of
    Right parsed -> pure $ length parsed
    Left _       -> pure 0
-- | Read the content of every entry of the zip archive at @fp@.
--
-- The path is resolved with 'resolveFile'', the entry selectors are
-- listed once, and then each entry is fetched concurrently; the archive
-- is re-opened with 'withArchive' for every entry.
openZipFiles :: FilePath -> IO [ByteString]
openZipFiles fp = do
  archive   <- resolveFile' fp
  selectors <- withArchive archive (DM.keys <$> getEntries)
  mapConcurrently (\sel -> withArchive archive (getEntry sel)) selectors
-- | Test helper: open the zip archive at @fp@ and run 'parseFile' with
-- the 'WOS' format on every entry concurrently.
--
-- NOTE(review): 'parseFile' is declared as returning @IO Corpus@ while
-- this function collects @Int@s -- confirm the intended type.
wosParserTest :: FilePath -> IO [Int]
wosParserTest fp = do
  files <- openZipFiles fp
  mapConcurrently (parseFile WOS) files
src/Data/Gargantext/Parsers/WOS.hs
View file @
2cd0e36a
...
...
@@ -2,53 +2,43 @@
module
Data.Gargantext.Parsers.WOS
where
-- TOFIX : Should import Data.Gargantext.Prelude here
import
Prelude
hiding
(
takeWhile
,
take
,
concat
,
readFile
)
import
qualified
Data.List
as
DL
import
Data.Map
as
DM
import
Data.Attoparsec.ByteString
import
Data.Attoparsec.ByteString.Char8
(
anyChar
,
isEndOfLine
)
import
Data.ByteString
(
ByteString
)
import
Data.ByteString.Char8
(
pack
)
import
Data.Either.Extra
(
Either
(
..
))
import
Control.Applicative
import
Control.Monad
(
join
)
-- To be removed just for Tests
--
-- import Codec.Archive.LibZip (withArchive, fileNames, sourceFile, addFile)
--import Codec.Archive.LibZip.Types (ZipSource, OpenFlag (CreateFlag))
import
Control.Concurrent.Async
as
CCA
(
mapConcurrently
)
import
Data.Gargantext.Types
import
Codec.Archive.Zip
import
Path.IO
(
resolveFile'
)
-- import qualified Data.ByteString.Lazy as B
import
Control.Applicative
(
(
<$>
)
)
-- type Parser a = a -> Text -> [Document]
-- | Formats that 'runParser' can be asked to handle. Only 'WOS' has a
-- parser attached; any other value reaches the "Not implemented yet"
-- branch of 'runParser'.
data ParserType
  = WOS
  | CSV
-- | 'wosParser' parses the ISI format of the Web Of Science database.
--
-- Placeholder: not implemented yet, so evaluating it hits 'undefined'.
wosParser :: ByteString -> IO Corpus
wosParser = undefined
-- | Item type produced by the WOS parsers; currently a plain
-- 'ByteString'.
type WosDoc = ByteString
wosParser
::
Parser
[
Maybe
[
WosDoc
]]
wosParser
=
do
wosParser'
::
Parser
[
Maybe
[
ByteString
]]
wosParser'
=
do
-- TODO Warning if version /= 1.0
-- FIXME anyChar (string ..) /= exact string "\nVR 1.0" ?
_
<-
manyTill
anyChar
(
string
$
pack
"
\n
VR 1.0"
)
ns
<-
many1
wosNotice
<*
(
string
$
pack
"
\n
EF"
)
return
ns
-- | Parse a single notice.
--
-- A notice starts with a @"\\nPT "@ line (its remainder is consumed and
-- discarded), is followed by the fields parsed by 'wosFields', and ends
-- at the @"\\nER\\n"@ marker; anything between the fields and that
-- marker is skipped.
wosNotice :: Parser (Maybe [ByteString])
wosNotice = startNotice *> wosFields <* endNotice
  where
    -- Skip everything up to and including the end-of-record marker.
    endNotice :: Parser [Char]
    endNotice = manyTill anyChar (string $ pack "\nER\n")

    -- Match the notice header and consume the rest of that line.
    startNotice :: Parser ByteString
    startNotice = "\nPT " *> takeTill isEndOfLine
field'
::
Parser
(
ByteString
,
[
ByteString
])
field'
=
do
...
...
@@ -80,7 +70,7 @@ wosFields = do
-- DL.lookup "URL" ws
-- DL.lookup "PA" ws
-- DL.lookup "TI" ws
--
-- | Zero or more consecutive lines, each parsed by 'line'.
wosLines :: Parser [ByteString]
wosLines = many line
...
...
@@ -88,32 +78,3 @@ wosLines = many line
-- | One line: a leading @"\\n"@ followed by everything up to (but not
-- including) the next end-of-line character.
line :: Parser ByteString
line = "\n" *> takeTill isEndOfLine
-- | Run the parser associated with the parser type @p@ on the input
-- @x@ with 'parseOnly'.
--
-- Only 'WOS' is supported; any other parser type aborts with 'error'
-- once the selected parser is forced.
runParser
  :: ParserType
  -> ByteString
  -> Either String [Maybe [WosDoc]]
runParser p x = parseOnly (select p) x
  where
    select WOS = wosParser
    select _   = error "Not implemented yet"
-- isTokenChar :: Word8 -> Bool
-- isTokenChar = inClass "!#$%&'()*+./0-9:<=>?@a-zA-Z[]^_`{|}~-\n"
-- | Read the content of every entry of the zip archive at @fp@.
--
-- The path is resolved with 'resolveFile'', the entry selectors are
-- listed once, and then each entry is fetched concurrently; the archive
-- is re-opened with 'withArchive' for every entry.
zipFiles :: FilePath -> IO [ByteString]
zipFiles fp = do
  archive   <- resolveFile' fp
  selectors <- withArchive archive (DM.keys <$> getEntries)
  mapConcurrently (\sel -> withArchive archive (getEntry sel)) selectors
-- | Parse one file's content with the parser selected for @p@ and
-- report how many top-level items were parsed.
--
-- Yields @0@ when 'runParser' fails.
parseFile :: ParserType -> ByteString -> IO Int
parseFile p x =
  case runParser p x of
    Right parsed -> pure $ length parsed
    Left _       -> pure 0
-- | Open the zip archive at @fp@ with 'zipFiles' and run 'parseFile'
-- with the 'WOS' parser type on every entry concurrently, returning one
-- count per entry.
parseWos :: FilePath -> IO [Int]
parseWos fp = do
  files <- zipFiles fp
  mapConcurrently (parseFile WOS) files
src/Data/Gargantext/Types/Main.hs
View file @
2cd0e36a
...
...
@@ -5,6 +5,8 @@
module
Data.Gargantext.Types.Main
where
import
Protolude
(
fromMaybe
)
import
Data.ByteString
(
ByteString
())
import
Data.Text
(
Text
)
import
Data.Time
(
UTCTime
)
import
Data.Gargantext.Types.Node
(
NodePoly
...
...
@@ -17,7 +19,8 @@ import Data.Gargantext.Types.Node ( NodePoly
)
-- | Language of a text.
-- For simplicity, a text is assumed to be written in a single,
-- homogeneous language.
data Language
  = EN
  | FR
  -- Not enabled yet: DE | IT | SP
-- > EN == english
-- > FR == french
...
...
@@ -28,6 +31,12 @@ data Language = EN | FR -- | DE | IT | SP
-- | Ngrams, represented as a triple of 'Text' values.
type Ngrams = (Text, Text, Text)

-- | Error message reported when a 'GargParser' fails.
type ErrorMessage = String

-- | Parse texts: a parser takes the raw input and yields either an
-- 'ErrorMessage' or the resulting 'Corpus'.
type GargParser = ByteString -> Either ErrorMessage Corpus
-- | TODO add Symbolic Node / Document
-- TODO make instances of Nodes
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment