Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
H
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Przemyslaw Kaminski
haskell-gargantext
Commits
cc31b225
Commit
cc31b225
authored
Jul 02, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[SPECS] index a corpus with term list.
parent
bd47a5e3
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
247 additions
and
1 deletion
+247
-1
.gitignore
.gitignore
+0
-1
Main.hs
bin/gargantext-cli/Main.hs
+46
-0
List.hs
src/Gargantext/Text/List.hs
+62
-0
CSV.hs
src/Gargantext/Text/List/CSV.hs
+103
-0
Types.hs
src/Gargantext/Text/List/Types.hs
+36
-0
No files found.
.gitignore
View file @
cc31b225
...
...
@@ -3,7 +3,6 @@
*.cabal
*purescript-gargantext
doc
bin
deps
profiling
_darcs
bin/gargantext-cli/Main.hs
0 → 100644
View file @
cc31b225
{-|
Module : Main.hs
Description : Gargantext starter
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Main specifications to index a corpus with a term list
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module
Main
where
import
qualified
Data.Vector
as
DV
import
Gargantext.Prelude
import
Data.Text
(
Text
)
import
Gargantext.Text.Parsers.CSV
(
readCsv
)
import
Gargantext.Text.List.CSV
(
fromCsvListFile
)
-- | Specification of the command-line pipeline that indexes a corpus with
-- a term list.
--
-- Steps, in order:
--
--   1. read exactly three parameters (corpus file, term list file, output
--      file) from 'readParams'; the list pattern fails at runtime when a
--      different number of parameters is returned;
--   2. load the corpus with 'readCsv' and keep the abstracts as a list;
--   3. load the term list with 'fromCsvListFile';
--   4. index the corpus with the term list and compute co-occurrences;
--   5. write the co-occurrences to the output file.
--
-- NOTE(review): @readParams@, @termListMap@, @indexCorpusWith@ and
-- @cooccurrences@ are neither defined nor imported in the visible part of
-- this module, and @csv_abstract@ is qualified with @DV@ (the alias of
-- "Data.Vector") although it is presumably a field exported by
-- "Gargantext.Text.Parsers.CSV" -- confirm before compiling.
main :: IO ()
main = do
  -- The binder was spelled @corpusfile@ while the body used @corpusFile@;
  -- both now use the same identifier.
  [corpusFile, termListFile, outputFile] <- readParams

  -- corpus :: [Text]
  corpus <- DV.toList <$> map DV.csv_abstract <$> readCsv corpusFile

  -- termListMap :: [Text]
  termList <- termListMap <$> fromCsvListFile termListFile

  let corpusIndexed = indexCorpusWith corpus termList
  let cooc          = cooccurrences corpusIndexed

  writeFile outputFile cooc
src/Gargantext/Text/
Terms/Lists
.hs
→
src/Gargantext/Text/
List
.hs
View file @
cc31b225
...
...
@@ -9,23 +9,29 @@ Portability : POSIX
Here is a longer description of this module, containing some
commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Text.
Terms.Lists
module
Gargantext.Text.
List
where
--import Data.Maybe
--import Data.List (filter)
--import Gargantext.Text
import
Data.Text
(
Text
)
import
qualified
Data.Text
as
DT
import
Gargantext.Prelude
--
-- | Name of a term list.
--
-- NOTE(review): the exact meaning of each list is not visible in this
-- module; presumably 'Stop' holds stop words, 'Candidate' candidate terms
-- and 'Graph' the terms kept for the graph -- confirm.
data ListName
  = Stop
  | Candidate
  | Graph
  deriving (Eq, Show)
-- TODO normalize text
-- TODO order the separators by probability of appearance

-- | Single-character texts treated as separators by 'isIn': space, comma,
-- full stop, question mark, exclamation mark and double quote, in that
-- order.
separators :: [Text]
separators = map DT.singleton [' ', ',', '.', '?', '!', '"']
-- | @isIn term context@ tells whether @term@ occurs in @context@ right
-- before a token boundary.
--
-- A boundary is either one of the 'separators' directly following the
-- term, or the end of @context@. The end-of-context case was previously
-- missed: a term closing the text without a trailing separator was
-- reported as absent.
--
-- Only the right-hand side of the term is checked: nothing is required
-- about the character preceding the term, so @term@ may match the tail of
-- a longer word.
isIn :: Text -> Text -> Bool
isIn term context =
  DT.isSuffixOf term context
    || any (\sep -> DT.isInfixOf (term <> sep) context) separators
------------------------------------------------------------------------
--graph :: [Ngrams] -> [Ngrams]
--graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
--
...
...
@@ -34,4 +40,23 @@ data ListName = Stop | Candidate | Graph
--
--stop :: [Ngrams] -> [Ngrams]
--stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
------------------------------------------------------------------------
-- | Attoparsec solution to index test
--import Data.Attoparsec.ByteString (Parser, parseOnly, try, string
-- , takeTill, take
-- , manyTill, many1)
--import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
--import Data.ByteString (ByteString, concat)
--import Data.ByteString.Char8 (pack)
--import Control.Applicative
-- | Attoparsec version
--indexParser :: (ByteString -> b) -> ByteString -> Parser b
--indexParser form2label x = do
-- _ <- manyTill anyChar (string x)
-- pure $ form2label x
--doIndex :: Applicative f => ByteString -> ByteString -> f (Either String [ByteString]
--doIndex f x txt = pure $ parseOnly (many $ indexParser f x) txt
------------------------------------------------------------------------
src/Gargantext/Text/List/CSV.hs
0 → 100644
View file @
cc31b225
{-|
Module : Gargantext.Text.List.CSV
Description :
Copyright : (c) CNRS, 2018-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
CSV parser and writer for Gargantext term list files.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module
Gargantext.Text.List.CSV
where
import
GHC.Real
(
round
)
import
GHC.IO
(
FilePath
)
import
Control.Applicative
import
Control.Monad
(
mzero
)
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
V
import
Gargantext.Prelude
hiding
(
length
)
import
Gargantext.Text.List.Types
------------------------------------------------------------------------
--csv2lists :: Vector CsvList -> Lists
--csv2lists v = V.foldl' (\e (CsvList listType label forms) -> insertLists lt label forms e) emptyLists v
------------------------------------------------------------------------
-- | Status of a term as stored in the @status@ column of a CSV list file.
-- The textual encoding of each constructor is given by the 'FromField'
-- and 'ToField' instances of this type.
data CsvListType
  = CsvMap
  | CsvStop
  | CsvCandidate
  deriving (Eq, Read, Show)
------------------------------------------------------------------------
-- CSV List Main Configuration
-- | Character separating the fields of a CSV list file: a tabulation.
-- Used by both 'csvDecodeOptions' and 'csvEncodeOptions'.
csvListFieldDelimiter :: Char
csvListFieldDelimiter = '\t'
-- | Delimiter text @|&|@.
--
-- NOTE(review): presumably separates the individual forms packed in the
-- @forms@ column ('csvList_forms'); it is not used in the visible code --
-- confirm against callers.
csvListFormsDelimiter :: Text
csvListFormsDelimiter = "|&|"
------------------------------------------------------------------------
-- | One row of a CSV list file. All fields are strict.
data CsvList = CsvList
  { csvList_status :: !CsvListType
    -- ^ content of the @status@ column
  , csvList_label  :: !Text
    -- ^ content of the @label@ column
  , csvList_forms  :: !Text
    -- ^ content of the @forms@ column, kept as raw text
  }
  deriving (Show)
------------------------------------------------------------------------
-- | Decode a 'CsvList' from a named record by looking up the @status@,
-- @label@ and @forms@ columns, in that order.
instance FromNamedRecord CsvList where
  parseNamedRecord record = do
    status <- record .: "status"
    label  <- record .: "label"
    forms  <- record .: "forms"
    pure (CsvList status label forms)
-- | Encode a 'CsvList' as a named record with the columns @status@,
-- @label@ and @forms@.
instance ToNamedRecord CsvList where
  toNamedRecord entry =
    namedRecord
      [ "status" .= csvList_status entry
      , "label"  .= csvList_label entry
      , "forms"  .= csvList_forms entry
      ]
------------------------------------------------------------------------
-- | Decode the @status@ field: @map@ gives 'CsvMap', @main@ gives
-- 'CsvCandidate' and @stop@ gives 'CsvStop'. Any other content makes the
-- parser fail with 'mzero'.
instance FromField CsvListType where
  parseField raw = case raw of
    "map"  -> pure CsvMap
    "main" -> pure CsvCandidate
    "stop" -> pure CsvStop
    _      -> mzero
-- | Encode the status field; mirror image of the 'FromField' instance:
-- 'CsvMap' gives @map@, 'CsvCandidate' gives @main@ and 'CsvStop' gives
-- @stop@.
instance ToField CsvListType where
  toField listType = case listType of
    CsvMap       -> "map"
    CsvCandidate -> "main"
    CsvStop      -> "stop"
------------------------------------------------------------------------
-- | Default decoding options, except that the field delimiter is
-- 'csvListFieldDelimiter' converted to its character code.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = defaultDecodeOptions { decDelimiter = delimiterCode }
  where
    delimiterCode = fromIntegral (ord csvListFieldDelimiter)
-- | Default encoding options, except that the field delimiter is
-- 'csvListFieldDelimiter' converted to its character code.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = defaultEncodeOptions { encDelimiter = delimiterCode }
  where
    delimiterCode = fromIntegral (ord csvListFieldDelimiter)
------------------------------------------------------------------------
-- | Read the file at @fp@ as a lazy byte string and decode it by column
-- name with 'csvDecodeOptions'.
--
-- Returns the header together with the decoded rows. When decoding fails,
-- 'panic' is called with the decoder's error message.
fromCsvListFile :: FilePath -> IO (Header, Vector CsvList)
fromCsvListFile fp = do
  bytes <- BL.readFile fp
  unwrap (decodeByNameWith csvDecodeOptions bytes)
  where
    -- Turn the decoder's result into the final IO action.
    unwrap (Left err)      = panic (pack err)
    unwrap (Right decoded) = pure decoded
------------------------------------------------------------------------
-- | Encode the rows by column name with 'csvEncodeOptions', using the
-- given header, and write the resulting lazy byte string to the file at
-- @fp@.
toCsvListFile :: FilePath -> (Header, Vector CsvList) -> IO ()
toCsvListFile fp (hdr, rows) = BL.writeFile fp encoded
  where
    encoded = encodeByNameWith csvEncodeOptions hdr (V.toList rows)
------------------------------------------------------------------------
src/Gargantext/Text/List/Types.hs
0 → 100644
View file @
cc31b225
{-|
Module : Gargantext.Text.List.Types
Description :
Copyright : (c) CNRS, 2018-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Types describing Gargantext term lists.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.List.Types
where
import
Prelude
(
Bounded
,
Enum
,
minBound
,
maxBound
)
import
Data.Text
(
Text
)
import
Data.Map
(
Map
,
empty
,
fromList
,
insert
,
lookup
)
import
Gargantext.Prelude
-------------------------------------------------------------------
-- | A label is plain 'Text'; this alias adds readability only, not type
-- safety.
type Label = Text
-- | Kind of term list. The 'Enum' and 'Bounded' instances follow the
-- declaration order: 'Map', then 'Stop', then 'Candidate'.
data ListType
  = Map
  | Stop
  | Candidate
  deriving (Eq, Ord, Show, Enum, Bounded)
-- | All lists, keyed by 'ListType'. Each list is itself a map from a
-- 'Text' key to a list of 'Text' values.
--
-- NOTE(review): presumably the inner key is a label and the values are
-- its forms -- confirm against the code that fills this structure.
type Lists = Map ListType (Map Text [Text])
-- | 'Lists' value holding an empty inner map for every 'ListType', from
-- 'minBound' to 'maxBound'.
emptyLists :: Lists
emptyLists =
  fromList
    [ (listType, empty)
    | listType <- ([minBound .. maxBound] :: [ListType])
    ]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment