Fix splitting and show progress

parent f33b0c12
...@@ -24,7 +24,11 @@ module Main where ...@@ -24,7 +24,11 @@ module Main where
import qualified Data.Vector as DV import qualified Data.Vector as DV
import Control.Monad (zipWithM)
import Control.Monad.IO.Class
import Data.Text (Text) import Data.Text (Text)
import Data.List (cycle)
import System.IO (hPutStr, hFlush, stderr)
import System.Environment import System.Environment
--import Control.Concurrent.Async as CCA (mapConcurrently) --import Control.Concurrent.Async as CCA (mapConcurrently)
...@@ -37,6 +41,17 @@ import Gargantext.Text.List.CSV (csvGraphTermList) ...@@ -37,6 +41,17 @@ import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms (terms) import Gargantext.Text.Terms (terms)
import Gargantext.Text.Metrics.Count (cooc) import Gargantext.Text.Metrics.Count (cooc)
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr "\rDone\n"
pure bs
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
main :: IO () main :: IO ()
main = do main = do
[corpusFile, termListFile, outputFile] <- getArgs [corpusFile, termListFile, outputFile] <- getArgs
...@@ -53,9 +68,8 @@ main = do ...@@ -53,9 +68,8 @@ main = do
putStrLn $ show $ length termList putStrLn $ show $ length termList
let patterns = WithList $ buildPatterns termList let patterns = WithList $ buildPatterns termList
corpusIndexed <- mapM (terms patterns) corpus corpusIndexed <- mapMP (terms patterns) corpus
mapM (putStrLn . show) corpusIndexed
putStrLn $ show corpusIndexed
let myCooc = cooc corpusIndexed let myCooc = cooc corpusIndexed
putStrLn $ show myCooc putStrLn $ show myCooc
......
...@@ -25,6 +25,7 @@ import Control.Monad (mzero) ...@@ -25,6 +25,7 @@ import Control.Monad (mzero)
import Data.Char (ord) import Data.Char (ord)
import Data.Csv import Data.Csv
import Data.Either (Either(Left, Right)) import Data.Either (Either(Left, Right))
import Data.List (null)
import Data.Text (Text, pack) import Data.Text (Text, pack)
import qualified Data.Text as DT import qualified Data.Text as DT
import qualified Data.ByteString.Lazy as BL import qualified Data.ByteString.Lazy as BL
...@@ -42,7 +43,7 @@ csvGraphTermList fp = csv2list CsvMap <$> snd <$> fromCsvListFile fp ...@@ -42,7 +43,7 @@ csvGraphTermList fp = csv2list CsvMap <$> snd <$> fromCsvListFile fp
csv2list :: CsvListType -> Vector CsvList -> TermList csv2list :: CsvListType -> Vector CsvList -> TermList
csv2list lt vs = V.toList $ V.map (\(CsvList _ label forms) csv2list lt vs = V.toList $ V.map (\(CsvList _ label forms)
-> (DT.words label, map DT.words $ DT.splitOn csvListFormsDelimiter forms)) -> (DT.words label, filter (not . null) . map DT.words $ DT.splitOn csvListFormsDelimiter forms))
$ V.filter (\l -> csvList_status l == lt ) vs $ V.filter (\l -> csvList_status l == lt ) vs
------------------------------------------------------------------------ ------------------------------------------------------------------------
......
...@@ -17,7 +17,6 @@ module Gargantext.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence) ...@@ -17,7 +17,6 @@ module Gargantext.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence)
where where
import Prelude (String) import Prelude (String)
import Data.Char (isSpace)
import Data.Text (Text) import Data.Text (Text)
import qualified Data.Text as T import qualified Data.Text as T
...@@ -50,7 +49,7 @@ monoText2term :: Lang -> Text -> Terms ...@@ -50,7 +49,7 @@ monoText2term :: Lang -> Text -> Terms
monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt) monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
monoTextsBySentence :: Text -> [[Text]] monoTextsBySentence :: Text -> [[Text]]
monoTextsBySentence = map (T.split isSpace) monoTextsBySentence = map T.words
. T.split isSep . T.split isSep
. T.toLower . T.toLower
......
...@@ -12,6 +12,7 @@ commentary with @some markup@. ...@@ -12,6 +12,7 @@ commentary with @some markup@.
-} -}
{-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE BangPatterns #-} {-# LANGUAGE BangPatterns #-}
module Gargantext.Text.Terms.WithList where module Gargantext.Text.Terms.WithList where
...@@ -24,8 +25,9 @@ import Gargantext.Core.Types (Terms(..)) ...@@ -24,8 +25,9 @@ import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence) import Gargantext.Text.Terms.Mono (monoTextsBySentence)
import Prelude (error)
import Gargantext.Prelude import Gargantext.Prelude
import Data.List (concatMap) import Data.List (null, concatMap)
import Data.Ord import Data.Ord
import qualified Data.Set as Set import qualified Data.Set as Set
...@@ -67,7 +69,10 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern ...@@ -67,7 +69,10 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
where where
buildPattern (label, alts) = map f (label : alts) buildPattern (label, alts) = map f (label : alts)
where where
f alt = Pattern (KMP.build alt) (length alt) f alt | "" `elem` alt = error "buildPatterns: ERR1"
| null alt = error "buildPatterns: ERR2"
| otherwise =
Pattern (KMP.build alt) (length alt)
(Terms label $ Set.empty) -- TODO check stems (Terms label $ Set.empty) -- TODO check stems
extractTermsWithList :: Patterns -> Text -> Corpus Terms extractTermsWithList :: Patterns -> Text -> Corpus Terms
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment