Commit cc31b225 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[SPECS] index a corpus with term list.

parent bd47a5e3
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
*.cabal *.cabal
*purescript-gargantext *purescript-gargantext
doc doc
bin
deps deps
profiling profiling
_darcs _darcs
{-|
Module : Main.hs
Description : Gargantext starter
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Main specifications to index a corpus with a term list
-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module Main where
import qualified Data.Vector as DV
import Gargantext.Prelude
import Data.Text (Text)
import Gargantext.Text.Parsers.CSV (readCsv)
import Gargantext.Text.List.CSV (fromCsvListFile)
-- | Specification of the indexing pipeline: read a corpus and a term list
-- from CSV files, index the corpus with the terms, compute the
-- cooccurrences and write them to the output file.
--
-- Expects exactly three parameters, in this order: the corpus file, the term
-- list file and the output file.
--
-- NOTE(review): 'readParams', 'termListMap', 'indexCorpusWith' and
-- 'cooccurrences' are not defined or imported in the visible part of this
-- module -- presumably still to be written; confirm before building.
main :: IO ()
main = do
  -- The binding was previously spelled @corpusfile@ while the code below read
  -- @corpusFile@, leaving the latter unbound.
  [corpusFile, termListFile, outputFile] <- readParams

  -- corpus :: [Text]
  -- NOTE(review): @csv_abstract@ is qualified with the Data.Vector alias;
  -- presumably it belongs to the CSV parser module -- confirm.
  corpus <- DV.toList <$> map DV.csv_abstract <$> readCsv corpusFile

  -- termListMap :: [Text]
  termList <- termListMap <$> fromCsvListFile termListFile

  let corpusIndexed = indexCorpusWith corpus termList
  let cooc = cooccurrences corpusIndexed

  writeFile outputFile cooc
...@@ -9,23 +9,29 @@ Portability : POSIX ...@@ -9,23 +9,29 @@ Portability : POSIX
Here is a longer description of this module, containing some Here is a longer description of this module, containing some
commentary with @some markup@. commentary with @some markup@.
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
module Gargantext.Text.Terms.Lists module Gargantext.Text.List
where where
--import Data.Maybe import Data.Text (Text)
--import Data.List (filter) import qualified Data.Text as DT
--import Gargantext.Text
import Gargantext.Prelude import Gargantext.Prelude
--
data ListName = Stop | Candidate | Graph -- | TODO normalize text
deriving (Show, Eq)
-- | Strings that 'isIn' appends to a term before searching for it in a
-- context: a space, common punctuation marks and the double quote.
--
-- TODO: order the separators by their probability of appearance.
separators :: [Text]
separators = [" ", ",", ".", "?", "!", "\""]
-- | Tell whether @term@ occurs in @context@ and ends on a word boundary.
--
-- A match is reported when either
--
--   * the term is immediately followed by one of the 'separators', or
--   * the context ends with the term (previously a term closing the context
--     without trailing punctuation was never found).
--
-- NOTE(review): only the right-hand boundary is checked, so a term can still
-- match the tail of a longer word; an empty term matches every context.
isIn :: Text -> Text -> Bool
isIn term context = any (\matches -> matches context) predicates
  where
    -- One predicate per accepted ending: end of context, then each separator.
    predicates = DT.isSuffixOf term
               : map (\sep -> DT.isInfixOf (term <> sep)) separators
------------------------------------------------------------------------
--graph :: [Ngrams] -> [Ngrams] --graph :: [Ngrams] -> [Ngrams]
--graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs --graph ngs = filter (\ng -> _ngramsListName ng == Just Graph) ngs
-- --
...@@ -34,4 +40,23 @@ data ListName = Stop | Candidate | Graph ...@@ -34,4 +40,23 @@ data ListName = Stop | Candidate | Graph
-- --
--stop :: [Ngrams] -> [Ngrams] --stop :: [Ngrams] -> [Ngrams]
--stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs --stop ngs = filter (\ng -> _ngramsListName ng == Just Stop) ngs
------------------------------------------------------------------------
-- | Attoparsec solution to index test
--import Data.Attoparsec.ByteString (Parser, parseOnly, try, string
-- , takeTill, take
-- , manyTill, many1)
--import Data.Attoparsec.ByteString.Char8 (anyChar, isEndOfLine)
--import Data.ByteString (ByteString, concat)
--import Data.ByteString.Char8 (pack)
--import Control.Applicative
-- | Attoparsec version
--indexParser :: (ByteString -> b) -> ByteString -> Parser b
--indexParser form2label x = do
-- _ <- manyTill anyChar (string x)
-- pure $ form2label x
--doIndex :: Applicative f => ByteString -> ByteString -> f (Either String [ByteString]
--doIndex f x txt = pure $ parseOnly (many $ indexParser f x) txt
------------------------------------------------------------------------
{-|
Module : Gargantext.Text.List.CSV
Description :
Copyright : (c) CNRS, 2018-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
CSV parser and writer for Gargantext term list files.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveGeneric #-}
module Gargantext.Text.List.CSV where
import GHC.Real (round)
import GHC.IO (FilePath)
import Control.Applicative
import Control.Monad (mzero)
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.Text (Text, pack, length, intercalate)
import qualified Data.ByteString.Lazy as BL
import Data.Vector (Vector)
import qualified Data.Vector as V
import Gargantext.Prelude hiding (length)
import Gargantext.Text.List.Types
------------------------------------------------------------------------
--csv2lists :: Vector CsvList -> Lists
--csv2lists v = V.foldl' (\e (CsvList listType label forms) -> insertLists listType label forms e) emptyLists v
------------------------------------------------------------------------
-- | Status of a term-list row as stored in the CSV @status@ column.
-- The textual encoding of each constructor is given by the 'FromField' and
-- 'ToField' instances of this type.
data CsvListType = CsvMap | CsvStop | CsvCandidate
deriving (Read, Show, Eq)
------------------------------------------------------------------------
-- CSV List Main Configuration
-- | Character separating the fields of a term-list file (a tab); used by
-- both 'csvDecodeOptions' and 'csvEncodeOptions'.
csvListFieldDelimiter :: Char
csvListFieldDelimiter = '\t'
-- | Delimiter for the @forms@ field.
-- NOTE(review): not referenced by the visible code; presumably it separates
-- the individual forms packed into 'csvList_forms' -- confirm.
csvListFormsDelimiter :: Text
csvListFormsDelimiter = "|&|"
------------------------------------------------------------------------
-- | One row of a term-list CSV file. All fields are strict.
data CsvList = CsvList
-- | Value of the @status@ column.
{ csvList_status :: !CsvListType
-- | Value of the @label@ column.
, csvList_label :: !Text
-- | Value of the @forms@ column, kept as raw text.
-- NOTE(review): presumably several forms joined with
-- 'csvListFormsDelimiter' -- confirm.
, csvList_forms :: !Text
}
deriving (Show)
------------------------------------------------------------------------
-- | Decode a row by column name: @status@, @label@ and @forms@.
instance FromNamedRecord CsvList where
  parseNamedRecord row = do
    status <- row .: "status"
    label  <- row .: "label"
    forms  <- row .: "forms"
    pure (CsvList status label forms)
-- | Encode a row under the column names @status@, @label@ and @forms@.
instance ToNamedRecord CsvList where
  toNamedRecord (CsvList status label forms) = namedRecord columns
    where
      columns =
        [ "status" .= status
        , "label"  .= label
        , "forms"  .= forms
        ]
------------------------------------------------------------------------
-- | Parse the @status@ column: @map@, @main@ (candidate) or @stop@.
-- Any other value makes the parser fail with 'mzero'.
instance FromField CsvListType where
  parseField raw = case raw of
    "map"  -> pure CsvMap
    "main" -> pure CsvCandidate
    "stop" -> pure CsvStop
    _      -> mzero
-- | Render a list type with the same keywords the 'FromField' instance
-- accepts, so that encoding followed by decoding round-trips.
instance ToField CsvListType where
  toField listType = case listType of
    CsvMap       -> "map"
    CsvCandidate -> "main"
    CsvStop      -> "stop"
------------------------------------------------------------------------
-- | Default decoding options with the field delimiter replaced by
-- 'csvListFieldDelimiter'.
csvDecodeOptions :: DecodeOptions
csvDecodeOptions = defaultDecodeOptions { decDelimiter = delimiterByte }
  where
    -- The option expects the delimiter as a numeric code, not a 'Char'.
    delimiterByte = fromIntegral (ord csvListFieldDelimiter)
-- | Default encoding options with the field delimiter replaced by
-- 'csvListFieldDelimiter'.
csvEncodeOptions :: EncodeOptions
csvEncodeOptions = defaultEncodeOptions { encDelimiter = delimiterByte }
  where
    -- The option expects the delimiter as a numeric code, not a 'Char'.
    delimiterByte = fromIntegral (ord csvListFieldDelimiter)
------------------------------------------------------------------------
-- | Read a term-list file and decode it by column name with
-- 'csvDecodeOptions'. Returns the header together with the decoded rows.
--
-- A decoding failure aborts through 'panic' with the decoder's message.
fromCsvListFile :: FilePath -> IO (Header, Vector CsvList)
fromCsvListFile path = do
  rawBytes <- BL.readFile path
  let decoded = decodeByNameWith csvDecodeOptions rawBytes
  case decoded of
    Right parsed -> pure parsed
    Left  errMsg -> panic (pack errMsg)
------------------------------------------------------------------------
-- | Encode the rows by column name with 'csvEncodeOptions', using the given
-- header, and write the result to the file.
toCsvListFile :: FilePath -> (Header, Vector CsvList) -> IO ()
toCsvListFile path (hdr, rows) = BL.writeFile path encoded
  where
    encoded = encodeByNameWith csvEncodeOptions hdr (V.toList rows)
------------------------------------------------------------------------
{-|
Module : Gargantext.Text.List.Types
Description :
Copyright : (c) CNRS, 2018-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Types describing Gargantext term lists.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.List.Types where
import Prelude (Bounded, Enum, minBound, maxBound)
import Data.Text (Text)
import Data.Map (Map, empty, fromList, insert, lookup)
import Gargantext.Prelude
-------------------------------------------------------------------
-- | Plain alias for 'Text'.
-- NOTE(review): presumably the canonical label of a term -- confirm.
type Label = Text
-- | The kinds of term list. The 'Enum' and 'Bounded' instances allow
-- enumerating every kind, as 'emptyLists' does.
data ListType = Map | Stop | Candidate
deriving (Show, Eq, Ord, Enum, Bounded)
-- | For each list type, a map from a 'Text' key to a list of 'Text' values.
-- NOTE(review): presumably label -> forms -- confirm.
type Lists = Map ListType (Map Text [Text])
-- | 'Lists' value in which every 'ListType' is present and bound to an
-- empty map.
emptyLists :: Lists
emptyLists = fromList [ (listType, empty) | listType <- allListTypes ]
  where
    allListTypes = [minBound .. maxBound] :: [ListType]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment