Fix splitting and show progress

parent f33b0c12
......@@ -24,7 +24,11 @@ module Main where
import qualified Data.Vector as DV
import Control.Monad (zipWithM)
import Control.Monad.IO.Class
import Data.Text (Text)
import Data.List (cycle)
import System.IO (hPutStr, hFlush, stderr)
import System.Environment
--import Control.Concurrent.Async as CCA (mapConcurrently)
......@@ -37,6 +41,17 @@ import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms (terms)
import Gargantext.Text.Metrics.Count (cooc)
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr "\rDone\n"
pure bs
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
main :: IO ()
main = do
[corpusFile, termListFile, outputFile] <- getArgs
......@@ -53,9 +68,8 @@ main = do
putStrLn $ show $ length termList
let patterns = WithList $ buildPatterns termList
corpusIndexed <- mapM (terms patterns) corpus
putStrLn $ show corpusIndexed
corpusIndexed <- mapMP (terms patterns) corpus
mapM (putStrLn . show) corpusIndexed
let myCooc = cooc corpusIndexed
putStrLn $ show myCooc
......
......@@ -25,6 +25,7 @@ import Control.Monad (mzero)
import Data.Char (ord)
import Data.Csv
import Data.Either (Either(Left, Right))
import Data.List (null)
import Data.Text (Text, pack)
import qualified Data.Text as DT
import qualified Data.ByteString.Lazy as BL
......@@ -42,7 +43,7 @@ csvGraphTermList fp = csv2list CsvMap <$> snd <$> fromCsvListFile fp
csv2list :: CsvListType -> Vector CsvList -> TermList
csv2list lt vs = V.toList $ V.map (\(CsvList _ label forms)
-> (DT.words label, map DT.words $ DT.splitOn csvListFormsDelimiter forms))
-> (DT.words label, filter (not . null) . map DT.words $ DT.splitOn csvListFormsDelimiter forms))
$ V.filter (\l -> csvList_status l == lt ) vs
------------------------------------------------------------------------
......
......@@ -17,7 +17,6 @@ module Gargantext.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence)
where
import Prelude (String)
import Data.Char (isSpace)
import Data.Text (Text)
import qualified Data.Text as T
......@@ -50,7 +49,7 @@ monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map (T.split isSpace)
monoTextsBySentence = map T.words
. T.split isSep
. T.toLower
......
......@@ -12,6 +12,7 @@ commentary with @some markup@.
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-}
module Gargantext.Text.Terms.WithList where
......@@ -24,8 +25,9 @@ import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence)
import Prelude (error)
import Gargantext.Prelude
import Data.List (concatMap)
import Data.List (null, concatMap)
import Data.Ord
import qualified Data.Set as Set
......@@ -67,7 +69,10 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where
buildPattern (label, alts) = map f (label : alts)
where
f alt = Pattern (KMP.build alt) (length alt)
f alt | "" `elem` alt = error "buildPatterns: ERR1"
| null alt = error "buildPatterns: ERR2"
| otherwise =
Pattern (KMP.build alt) (length alt)
(Terms label $ Set.empty) -- TODO check stems
extractTermsWithList :: Patterns -> Text -> Corpus Terms
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment