Commit 94a16978 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CLI COOC] fix cooc behavior, next refacto and newtypes.

parent 7fe6bf9e
...@@ -22,17 +22,26 @@ Main specifications to index a corpus with a term list ...@@ -22,17 +22,26 @@ Main specifications to index a corpus with a term list
module Main where module Main where
import Data.ByteString.Lazy (writeFile)
import Data.Maybe (catMaybes) import Data.Maybe (catMaybes)
import Data.Text (pack) import Data.Text (pack)
import qualified Data.Text as DT
import Data.Tuple.Extra (both)
import qualified Data.Vector as DV import qualified Data.Vector as DV
import qualified Data.Maybe as DMaybe import qualified Data.Maybe as DMaybe
import Control.Monad (zipWithM) import Control.Monad (zipWithM)
import Control.Monad.IO.Class import Control.Monad.IO.Class
import qualified Data.IntMap as DM
import Data.Map (Map) import Data.Map (Map)
import qualified Data.IntMap as DIM
import qualified Data.Map as DM
import GHC.Generics
import Data.Aeson
import Data.Text (Text) import Data.Text (Text)
import Data.List (cycle, concat, unwords) import Data.List (cycle, concat, unwords)
import Data.List.Split (chunksOf) import Data.List.Split (chunksOf)
...@@ -51,44 +60,42 @@ import Gargantext.Text.Terms.WithList ...@@ -51,44 +60,42 @@ import Gargantext.Text.Terms.WithList
import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year) import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.List.CSV (csvGraphTermList) import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms (terms) import Gargantext.Text.Terms (terms)
import Gargantext.Text.Metrics.Count (coocOn, Coocs) import Gargantext.Text.Metrics.Count (coocOnContexts, Coocs)
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b] ------------------------------------------------------------------------
mapMP f xs = do -- OUTPUT format
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr "\rDone\n"
pure bs
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b] data CoocByYear = CoocByYear { year :: Int
mapConcurrentlyChunked f ts = do , coocurrences :: Map (Text, Text) Coocs
caps <- getNumCapabilities } deriving (Show, Generic)
let n = 1 `max` (length ts `div` caps)
concat <$> mapConcurrently (mapM f) (chunksOf n ts) data CoocByYears = CoocByYears { years :: [CoocByYear] }
deriving (Show, Generic)
instance ToJSON CoocByYear
instance ToJSON CoocByYears
------------------------------------------------------------------------
filterTermsAndCooc filterTermsAndCooc
:: TermType Lang :: Patterns
-> (Int, [Text]) -> (Int, [Text])
-> IO (Map (Terms, Terms) Coocs) -> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc patterns (year, ts) = do filterTermsAndCooc patterns (year, ts) = do
log "start" log "start"
r <- coocOn identity <$> mapM (\x -> {-log "work" >>-} terms patterns x) ts r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
log "stop" log "stop"
pure r pure $ CoocByYear year (DM.mapKeys (both DT.unwords) r)
where where
log m = do log m = do
tid <- myThreadId tid <- myThreadId
(p, _) <- threadCapability tid (p, _) <- threadCapability tid
putStrLn . unwords $ putStrLn . unwords $
["filterTermsAndCooc:", m, show year, "on proc", show p] ["filterTermsAndCooc:", m, show year, "on proc", show p]
--main :: IO [()] main :: IO ()
main = do main = do
[corpusFile, termListFile, _] <- getArgs [corpusFile, termListFile, outputFile] <- getArgs
--corpus :: IO (DM.IntMap [[Text]]) --corpus :: IO (DM.IntMap [[Text]])
corpus <- DM.fromListWith (<>) corpus <- DM.fromListWith (<>)
...@@ -102,18 +109,44 @@ main = do ...@@ -102,18 +109,44 @@ main = do
putStrLn $ show $ length termList putStrLn $ show $ length termList
let patterns = WithList $ buildPatterns termList let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus) -- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus) r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
putStrLn $ show r writeFile outputFile $ encode (CoocByYears r)
--writeFile outputFile cooc
------------------------------------------------------------------------
-- | Tools
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr "\rDone\n"
pure bs
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
caps <- getNumCapabilities
let n = 1 `max` (length ts `div` caps)
concat <$> mapConcurrently (mapM f) (chunksOf n ts)
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt
testCooc = do -- | TODO Minimal Example
let patterns = WithList $ buildPatterns testTermList --testCooc = do
mapM (\x -> {-log "work" >>-} terms patterns x) $ catMaybes $ map (head . snd) testCorpus -- let patterns = buildPatterns testTermList
--mapConcurrently (filterTermsAndCooc patterns) testCorpus -- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus :: [(Int, [Text])] testCorpus :: [(Int, [Text])]
......
...@@ -160,8 +160,10 @@ executables: ...@@ -160,8 +160,10 @@ executables:
- -O2 - -O2
- -Wmissing-signatures - -Wmissing-signatures
dependencies: dependencies:
- aeson
- async - async
- base - base
- bytestring
- containers - containers
- gargantext - gargantext
- vector - vector
......
...@@ -73,10 +73,10 @@ type Occs = Int ...@@ -73,10 +73,10 @@ type Occs = Int
type Coocs = Int type Coocs = Int
type Threshold = Int type Threshold = Int
removeApax :: Threshold -> Map (Label, Label) Int -> Map (Label, Label) Int removeApax :: Threshold -> Map ([Text], [Text]) Int -> Map ([Text], [Text]) Int
removeApax t = DMS.filter (> t) removeApax t = DMS.filter (> t)
cooc :: [[Terms]] -> Map (Label, Label) Int cooc :: [[Terms]] -> Map ([Text], [Text]) Int
cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
where where
terms_occs = occurrencesOn _terms_stem (List.concat tss) terms_occs = occurrencesOn _terms_stem (List.concat tss)
...@@ -91,12 +91,12 @@ coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss ...@@ -91,12 +91,12 @@ coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
delta f = f *** f delta f = f *** f
mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped Label mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped [Text]
mkLabelPolicy = DMS.map f where mkLabelPolicy = DMS.map f where
f = _terms_label . fst . maximumWith snd . DMS.toList f = _terms_label . fst . maximumWith snd . DMS.toList
-- TODO use the Foldable instance of Map instead of building a list -- TODO use the Foldable instance of Map instead of building a list
useLabelPolicy :: Map Grouped Label -> Grouped -> Label useLabelPolicy :: Map Grouped [Text] -> Grouped -> [Text]
useLabelPolicy m g = case DMS.lookup g m of useLabelPolicy m g = case DMS.lookup g m of
Just label -> label Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g) Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
...@@ -120,6 +120,8 @@ coocOn' fun ts = DMS.fromListWith (+) xs ...@@ -120,6 +120,8 @@ coocOn' fun ts = DMS.fromListWith (+) xs
, x >= y , x >= y
] ]
------------------------------------------------------------------------
coocOnContexts :: (a -> [Text]) -> [[a]] -> Map ([Text], [Text]) Int coocOnContexts :: (a -> [Text]) -> [[a]] -> Map ([Text], [Text]) Int
coocOnContexts fun = DMS.fromListWith (+) . List.concat . map (coocOnSingleContext fun) coocOnContexts fun = DMS.fromListWith (+) . List.concat . map (coocOnSingleContext fun)
...@@ -132,6 +134,7 @@ coocOnSingleContext fun ts = xs ...@@ -132,6 +134,7 @@ coocOnSingleContext fun ts = xs
, y <- ts' , y <- ts'
, x >= y , x >= y
] ]
------------------------------------------------------------------------
-- | Compute the grouped occurrences (occ) -- | Compute the grouped occurrences (occ)
......
...@@ -33,7 +33,6 @@ compute graph ...@@ -33,7 +33,6 @@ compute graph
module Gargantext.Text.Terms module Gargantext.Text.Terms
where where
import Data.List (concat)
import Data.Text (Text) import Data.Text (Text)
import Data.Traversable import Data.Traversable
...@@ -42,10 +41,9 @@ import Gargantext.Core ...@@ -42,10 +41,9 @@ import Gargantext.Core
import Gargantext.Core.Types import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms) import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoTerms) import Gargantext.Text.Terms.Mono (monoTerms)
import Gargantext.Text.Terms.WithList (Patterns, extractTermsWithList)
data TermType lang = Mono lang | Multi lang | MonoMulti lang | WithList Patterns data TermType lang = Mono lang | Multi lang | MonoMulti lang
group :: [Text] -> [Text] group :: [Text] -> [Text]
...@@ -68,6 +66,6 @@ terms :: TermType Lang -> Text -> IO [Terms] ...@@ -68,6 +66,6 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (WithList list) txt = pure . concat $ extractTermsWithList list txt -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------ ------------------------------------------------------------------------
...@@ -21,7 +21,6 @@ import qualified Data.Algorithms.KMP as KMP ...@@ -21,7 +21,6 @@ import qualified Data.Algorithms.KMP as KMP
import Data.Text (Text) import Data.Text (Text)
import qualified Data.IntMap.Strict as IntMap import qualified Data.IntMap.Strict as IntMap
import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence) import Gargantext.Text.Terms.Mono (monoTextsBySentence)
...@@ -29,21 +28,20 @@ import Prelude (error) ...@@ -29,21 +28,20 @@ import Prelude (error)
import Gargantext.Prelude import Gargantext.Prelude
import Data.List (null, concatMap) import Data.List (null, concatMap)
import Data.Ord import Data.Ord
import qualified Data.Set as Set
------------------------------------------------------------------------ ------------------------------------------------------------------------
data Pattern = Pattern data Pattern = Pattern
{ _pat_table :: !(KMP.Table Term) { _pat_table :: !(KMP.Table Text)
, _pat_length :: !Int , _pat_length :: !Int
, _pat_terms :: !Terms , _pat_terms :: ![Text]
} }
type Patterns = [Pattern] type Patterns = [Pattern]
------------------------------------------------------------------------ ------------------------------------------------------------------------
replaceTerms :: Patterns -> Sentence Term -> Sentence Terms replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0 replaceTerms pats terms = go 0
where where
terms_len = length terms terms_len = length terms
...@@ -72,8 +70,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern ...@@ -72,8 +70,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
f alt | "" `elem` alt = error "buildPatterns: ERR1" f alt | "" `elem` alt = error "buildPatterns: ERR1"
| null alt = error "buildPatterns: ERR2" | null alt = error "buildPatterns: ERR2"
| otherwise = | otherwise =
Pattern (KMP.build alt) (length alt) Pattern (KMP.build alt) (length alt) label
(Terms label $ Set.empty) -- TODO check stems --(Terms label $ Set.empty) -- TODO check stems
extractTermsWithList :: Patterns -> Text -> Corpus Terms extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment