Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
haskell-gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
158
Issues
158
List
Board
Labels
Milestones
Merge Requests
11
Merge Requests
11
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
haskell-gargantext
Commits
cc31b225
Commit
cc31b225
authored
Jul 02, 2018
by
Alexandre Delanoë
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[SPECS] index a corpus with term list.
parent
bd47a5e3
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
247 additions
and
1 deletion
+247
-1
.gitignore
.gitignore
+0
-1
Main.hs
bin/gargantext-cli/Main.hs
+46
-0
List.hs
src/Gargantext/Text/List.hs
+62
-0
CSV.hs
src/Gargantext/Text/List/CSV.hs
+103
-0
Types.hs
src/Gargantext/Text/List/Types.hs
+36
-0
No files found.
.gitignore
View file @
cc31b225
...
@@ -3,7 +3,6 @@
...
@@ -3,7 +3,6 @@
*.cabal
*.cabal
*purescript-gargantext
*purescript-gargantext
doc
doc
bin
deps
deps
profiling
profiling
_darcs
_darcs
bin/gargantext-cli/Main.hs
0 → 100644
View file @
cc31b225
{-|
Module : Main.hs
Description : Gargantext starter
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Main specifications to index a corpus with a term list
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module
Main
where
import
qualified
Data.Vector
as
DV
import
Gargantext.Prelude
import
Data.Text
(
Text
)
import
Gargantext.Text.Parsers.CSV
(
readCsv
)
import
Gargantext.Text.List.CSV
(
fromCsvListFile
)
-- | Specification of the corpus-indexing command: read a corpus and a term
-- list, index the corpus with the terms, compute the cooccurrences and write
-- them to the output file.
--
-- NOTE(review): 'readParams', 'termListMap', 'indexCorpusWith' and
-- 'cooccurrences' are neither defined nor imported in the visible part of
-- this module, and @csv_abstract@ is referenced through the @DV@ qualifier —
-- confirm where these names are expected to come from.
main :: IO ()
main = do
  -- Exactly three parameters are expected; any other count makes this
  -- pattern match fail.
  [corpusFile, termListFile, outputFile] <- readParams

  -- corpus :: [Text]
  corpus <- DV.toList <$> map DV.csv_abstract <$> readCsv corpusFile

  -- termListMap :: [Text]
  termList <- termListMap <$> fromCsvListFile termListFile

  let corpusIndexed = indexCorpusWith corpus termList
  let cooc = cooccurrences corpusIndexed

  writeFile outputFile cooc
src/Gargantext/Text/
Terms/Lists
.hs
→
src/Gargantext/Text/
List
.hs
View file @
cc31b225
...
@@ -9,23 +9,29 @@ Portability : POSIX
...
@@ -9,23 +9,29 @@ Portability : POSIX
Here is a longer description of this module, containing some
Here is a longer description of this module, containing some
commentary with @some markup@.
commentary with @some markup@.
-}
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE NoImplicitPrelude #-}
module
Gargantext.Text.
Terms.Lists
module
Gargantext.Text.
List
where
where
--import Data.Maybe
import
Data.Text
(
Text
)
--import Data.List (filter)
import
qualified
Data.Text
as
DT
--import Gargantext.Text
import
Gargantext.Prelude
import
Gargantext.Prelude
--
-- | Name of a term list: 'Stop', 'Candidate' or 'Graph'.
--
-- TODO normalize text
data ListName
  = Stop
  | Candidate
  | Graph
  deriving (Show, Eq)
-- | Pieces of text that 'isIn' accepts directly after a term.
--
-- TODO: order the separators by probability of occurrence.
separators :: [Text]
separators =
  [ " "
  , ","
  , "."
  , "?"
  , "!"
  , "\""
  ]
-- | Tell whether @term@ occurs in @context@ immediately followed by one of
-- the 'separators'.
--
-- NOTE(review): a term that ends the context without a trailing separator is
-- not matched, and nothing is checked before the term — confirm this is the
-- intended behaviour.
isIn :: Text -> Text -> Bool
isIn term context = any followedBy separators
  where
    -- Does the term, with this separator appended, appear in the context?
    followedBy sep = DT.isInfixOf (term <> sep) context
------------------------------------------------------------------------
--graph :: [Ngrams] -> [Ngrams]
--graph :: [Ngrams] -> [Ngrams]
--graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
--graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
--
--
...
@@ -34,4 +40,23 @@ data ListName = Stop | Candidate | Graph
...
@@ -34,4 +40,23 @@ data ListName = Stop | Candidate | Graph
--
--
--stop :: [Ngrams] -> [Ngrams]
--stop :: [Ngrams] -> [Ngrams]
--stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
--stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
------------------------------------------------------------------------
-- | Attoparsec solution to index test
--import Data.Attoparsec.ByteString (Parser, parseOnly, try, string
-- , takeTill, take
-- , manyTill, many1)
--import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
--import Data.ByteString (ByteString, concat)
--import Data.ByteString.Char8 (pack)
--import Control.Applicative
-- | Attoparsec version
--indexParser :: (ByteString -> b) -> ByteString -> Parser b
--indexParser form2label x = do
-- _ <- manyTill anyChar (string x)
-- pure $ form2label x
--doIndex :: Applicative f => ByteString -> ByteString -> f (Either String [ByteString]
--doIndex f x txt = pure $ parseOnly (many $ indexParser f x) txt
------------------------------------------------------------------------
src/Gargantext/Text/List/CSV.hs
0 → 100644
View file @
cc31b225
{-|
Module : Gargantext.Text.List.CSV
Description :
Copyright : (c) CNRS, 2018-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
CSV parser and serializer for Gargantext term list files.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module
Gargantext.Text.List.CSV
where
import
GHC.Real
(
round
)
import
GHC.IO
(
FilePath
)
import
Control.Applicative
import
Control.Monad
(
mzero
)
import
Data.Char
(
ord
)
import
Data.Csv
import
Data.Either
(
Either
(
Left
,
Right
))
import
Data.Text
(
Text
,
pack
,
length
,
intercalate
)
import
qualified
Data.ByteString.Lazy
as
BL
import
Data.Vector
(
Vector
)
import
qualified
Data.Vector
as
V
import
Gargantext.Prelude
hiding
(
length
)
import
Gargantext.Text.List.Types
------------------------------------------------------------------------
--csv2lists :: Vector CsvList -> Lists
--csv2lists v = V.foldl' (\e (CsvList listType label forms) -> insertLists lt label forms e) emptyLists v
------------------------------------------------------------------------
-- | Status of a row in a CSV term list file.
--
-- The textual form used inside the CSV file is defined by the 'FromField'
-- and 'ToField' instances, not by the derived 'Read' / 'Show' instances.
data CsvListType
  = CsvMap
  | CsvStop
  | CsvCandidate
  deriving (Read, Show, Eq)
------------------------------------------------------------------------
-- CSV List Main Configuration
-- | Character separating the fields of a CSV list file (a tab); it is used
-- by both 'csvDecodeOptions' and 'csvEncodeOptions'.
csvListFieldDelimiter :: Char
csvListFieldDelimiter = '\t'
-- | Delimiter configured for the @forms@ field.
--
-- NOTE(review): nothing in the visible code splits or joins on this value
-- yet — presumably it separates the individual forms inside
-- 'csvList_forms'; confirm against the callers.
csvListFormsDelimiter :: Text
csvListFormsDelimiter = "|&|"
------------------------------------------------------------------------
-- | One row of a CSV list file; all fields are strict.
data CsvList = CsvList
  { csvList_status :: !CsvListType -- ^ content of the @status@ column
  , csvList_label  :: !Text        -- ^ content of the @label@ column
  , csvList_forms  :: !Text        -- ^ raw content of the @forms@ column
  }
  deriving (Show)
------------------------------------------------------------------------
-- | Build a 'CsvList' from the @status@, @label@ and @forms@ columns of a
-- named record.
instance FromNamedRecord CsvList where
  parseNamedRecord record = do
    status <- record .: "status"
    label  <- record .: "label"
    forms  <- record .: "forms"
    pure (CsvList status label forms)
-- | Write a 'CsvList' as a named record with the @status@, @label@ and
-- @forms@ columns.
instance ToNamedRecord CsvList where
  toNamedRecord CsvList { csvList_status = status
                        , csvList_label  = label
                        , csvList_forms  = forms
                        } =
    namedRecord
      [ "status" .= status
      , "label"  .= label
      , "forms"  .= forms
      ]
------------------------------------------------------------------------
-- | Decode the @status@ field: @map@, @main@ and @stop@ are accepted; any
-- other value makes the parser fail with 'mzero'.
--
-- NOTE(review): @main@ is decoded as 'CsvCandidate'. The 'ToField' instance
-- mirrors this, so round-trips are consistent — confirm the wording is
-- intended.
instance FromField CsvListType where
  parseField field = case field of
    "map"  -> pure CsvMap
    "main" -> pure CsvCandidate
    "stop" -> pure CsvStop
    _      -> mzero
-- | Encode the status with the same keywords the 'FromField' instance
-- accepts: @map@, @main@ (for 'CsvCandidate') and @stop@.
instance ToField CsvListType where
  toField listType = case listType of
    CsvMap       -> "map"
    CsvCandidate -> "main"
    CsvStop      -> "stop"
------------------------------------------------------------------------
-- | Default decoding options, except that fields are delimited by
-- 'csvListFieldDelimiter'.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions =
  defaultDecodeOptions
    { decDelimiter = fromIntegral (ord csvListFieldDelimiter)
    }
-- | Default encoding options, except that fields are delimited by
-- 'csvListFieldDelimiter'.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions =
  defaultEncodeOptions
    { encDelimiter = fromIntegral (ord csvListFieldDelimiter)
    }
------------------------------------------------------------------------
-- | Read the file at the given path and decode it, by column name, with
-- 'csvDecodeOptions'.
--
-- Returns the header together with the decoded rows. When decoding fails,
-- the decoder's error message is passed to 'panic'.
fromCsvListFile :: FilePath -> IO (Header, Vector CsvList)
fromCsvListFile fp = do
  contents <- BL.readFile fp
  case decodeByNameWith csvDecodeOptions contents of
    Right decoded -> pure decoded
    Left message  -> panic $ pack message
-- | Encode the rows by column name, using the given header and
-- 'csvEncodeOptions', and write the result to the file at the given path.
toCsvListFile :: FilePath -> (Header, Vector CsvList) -> IO ()
toCsvListFile fp (header, rows) =
  BL.writeFile fp (encodeByNameWith csvEncodeOptions header (V.toList rows))
------------------------------------------------------------------------
src/Gargantext/Text/List/Types.hs
0 → 100644
View file @
cc31b225
{-|
Module : Gargantext.Text.List.Types
Description :
Copyright : (c) CNRS, 2018-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Types for Gargantext term lists.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module
Gargantext.Text.List.Types
where
import
Prelude
(
Bounded
,
Enum
,
minBound
,
maxBound
)
import
Data.Text
(
Text
)
import
Data.Map
(
Map
,
empty
,
fromList
,
insert
,
lookup
)
import
Gargantext.Prelude
-------------------------------------------------------------------
-- | A label is plain 'Text'.
type Label = Text
-- | The available kinds of list. The 'Enum' and 'Bounded' instances let
-- 'emptyLists' enumerate every constructor.
data ListType
  = Map
  | Stop
  | Candidate
  deriving (Show, Eq, Ord, Enum, Bounded)
-- | For every 'ListType', a map from a 'Text' key to a list of 'Text'.
--
-- NOTE(review): presumably the inner map goes from a label to its forms —
-- confirm against the code that fills it.
type Lists = Map ListType (Map Text [Text])
-- | 'Lists' in which every 'ListType' constructor is present and bound to an
-- empty map.
emptyLists :: Lists
emptyLists =
  fromList [ (listType, empty) | listType <- [minBound .. maxBound] ]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment