Commit 94a16978 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[CLI COOC] fix cooc behavior, next refacto and newtypes.

parent 7fe6bf9e
......@@ -22,17 +22,26 @@ Main specifications to index a corpus with a term list
module Main where
import Data.ByteString.Lazy (writeFile)
import Data.Maybe (catMaybes)
import Data.Text (pack)
import qualified Data.Text as DT
import Data.Tuple.Extra (both)
import qualified Data.Vector as DV
import qualified Data.Maybe as DMaybe
import Control.Monad (zipWithM)
import Control.Monad.IO.Class
import qualified Data.IntMap as DM
import Data.Map (Map)
import qualified Data.IntMap as DIM
import qualified Data.Map as DM
import GHC.Generics
import Data.Aeson
import Data.Text (Text)
import Data.List (cycle, concat, unwords)
import Data.List.Split (chunksOf)
......@@ -51,44 +60,42 @@ import Gargantext.Text.Terms.WithList
import Gargantext.Text.Parsers.CSV (readCsv, csv_title, csv_abstract, csv_publication_year)
import Gargantext.Text.List.CSV (csvGraphTermList)
import Gargantext.Text.Terms (terms)
import Gargantext.Text.Metrics.Count (coocOn, Coocs)
import Gargantext.Text.Metrics.Count (coocOnContexts, Coocs)
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr "\rDone\n"
pure bs
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
------------------------------------------------------------------------
-- OUTPUT format
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
caps <- getNumCapabilities
let n = 1 `max` (length ts `div` caps)
concat <$> mapConcurrently (mapM f) (chunksOf n ts)
data CoocByYear = CoocByYear { year :: Int
, coocurrences :: Map (Text, Text) Coocs
} deriving (Show, Generic)
data CoocByYears = CoocByYears { years :: [CoocByYear] }
deriving (Show, Generic)
instance ToJSON CoocByYear
instance ToJSON CoocByYears
------------------------------------------------------------------------
filterTermsAndCooc
:: TermType Lang
:: Patterns
-> (Int, [Text])
-> IO (Map (Terms, Terms) Coocs)
-> IO CoocByYear -- (Int, (Map (Text, Text) Coocs))
filterTermsAndCooc patterns (year, ts) = do
log "start"
r <- coocOn identity <$> mapM (\x -> {-log "work" >>-} terms patterns x) ts
r <- coocOnContexts identity <$> mapM (\x -> {-log "work" >>-} terms' patterns x) ts
log "stop"
pure r
pure $ CoocByYear year (DM.mapKeys (both DT.unwords) r)
where
log m = do
tid <- myThreadId
(p, _) <- threadCapability tid
putStrLn . unwords $
["filterTermsAndCooc:", m, show year, "on proc", show p]
--main :: IO [()]
main :: IO ()
main = do
[corpusFile, termListFile, _] <- getArgs
[corpusFile, termListFile, outputFile] <- getArgs
--corpus :: IO (DM.IntMap [[Text]])
corpus <- DM.fromListWith (<>)
......@@ -102,18 +109,44 @@ main = do
putStrLn $ show $ length termList
let patterns = WithList $ buildPatterns termList
let patterns = buildPatterns termList
-- r <- mapConcurrentlyChunked (filterTermsAndCooc patterns) (DM.toList corpus)
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
putStrLn $ show r
--writeFile outputFile cooc
r <- mapConcurrently (filterTermsAndCooc patterns) (DM.toList corpus)
writeFile outputFile $ encode (CoocByYears r)
------------------------------------------------------------------------
-- | Tools
mapMP :: MonadIO m => (a -> m b) -> [a] -> m [b]
mapMP f xs = do
bs <- zipWithM g (cycle "-\\|/") xs
liftIO $ hPutStr stderr "\rDone\n"
pure bs
where
g c x = do
liftIO $ hPutStr stderr ['\r',c]
liftIO $ hFlush stderr
f x
-- | Optimi that need further developments (not used yet)
mapConcurrentlyChunked :: (a -> IO b) -> [a] -> IO [b]
mapConcurrentlyChunked f ts = do
caps <- getNumCapabilities
let n = 1 `max` (length ts `div` caps)
concat <$> mapConcurrently (mapM f) (chunksOf n ts)
--terms' :: Patterns -> Text -> Corpus [[Text]]
terms' pats txt = pure $ concat $ extractTermsWithList pats txt
testCooc = do
let patterns = WithList $ buildPatterns testTermList
mapM (\x -> {-log "work" >>-} terms patterns x) $ catMaybes $ map (head . snd) testCorpus
--mapConcurrently (filterTermsAndCooc patterns) testCorpus
-- | TODO Minimal Example
--testCooc = do
-- let patterns = buildPatterns testTermList
-- mapM (\x -> {-log "work" >>-} terms' patterns x) $ catMaybes $ map (head . snd) testCorpus
-- --mapConcurrently (filterTermsAndCooc patterns) testCorpus
testCorpus :: [(Int, [Text])]
......
......@@ -160,8 +160,10 @@ executables:
- -O2
- -Wmissing-signatures
dependencies:
- aeson
- async
- base
- bytestring
- containers
- gargantext
- vector
......
......@@ -73,10 +73,10 @@ type Occs = Int
type Coocs = Int
type Threshold = Int
removeApax :: Threshold -> Map (Label, Label) Int -> Map (Label, Label) Int
removeApax :: Threshold -> Map ([Text], [Text]) Int -> Map ([Text], [Text]) Int
removeApax t = DMS.filter (> t)
cooc :: [[Terms]] -> Map (Label, Label) Int
cooc :: [[Terms]] -> Map ([Text], [Text]) Int
cooc tss = coocOnWithLabel _terms_stem (useLabelPolicy label_policy) tss
where
terms_occs = occurrencesOn _terms_stem (List.concat tss)
......@@ -91,12 +91,12 @@ coocOnWithLabel on' policy tss = mapKeys (delta policy) $ coocOn on' tss
delta f = f *** f
mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped Label
mkLabelPolicy :: Map Grouped (Map Terms Occs) -> Map Grouped [Text]
mkLabelPolicy = DMS.map f where
f = _terms_label . fst . maximumWith snd . DMS.toList
-- TODO use the Foldable instance of Map instead of building a list
useLabelPolicy :: Map Grouped Label -> Grouped -> Label
useLabelPolicy :: Map Grouped [Text] -> Grouped -> [Text]
useLabelPolicy m g = case DMS.lookup g m of
Just label -> label
Nothing -> panic $ "Label of Grouped not found: " <> (pack $ show g)
......@@ -120,6 +120,8 @@ coocOn' fun ts = DMS.fromListWith (+) xs
, x >= y
]
------------------------------------------------------------------------
coocOnContexts :: (a -> [Text]) -> [[a]] -> Map ([Text], [Text]) Int
coocOnContexts fun = DMS.fromListWith (+) . List.concat . map (coocOnSingleContext fun)
......@@ -132,6 +134,7 @@ coocOnSingleContext fun ts = xs
, y <- ts'
, x >= y
]
------------------------------------------------------------------------
-- | Compute the grouped occurrences (occ)
......
......@@ -33,7 +33,6 @@ compute graph
module Gargantext.Text.Terms
where
import Data.List (concat)
import Data.Text (Text)
import Data.Traversable
......@@ -42,10 +41,9 @@ import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoTerms)
import Gargantext.Text.Terms.WithList (Patterns, extractTermsWithList)
data TermType lang = Mono lang | Multi lang | MonoMulti lang | WithList Patterns
data TermType lang = Mono lang | Multi lang | MonoMulti lang
group :: [Text] -> [Text]
......@@ -68,6 +66,6 @@ terms :: TermType Lang -> Text -> IO [Terms]
terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
......@@ -21,7 +21,6 @@ import qualified Data.Algorithms.KMP as KMP
import Data.Text (Text)
import qualified Data.IntMap.Strict as IntMap
import Gargantext.Core.Types (Terms(..))
import Gargantext.Text.Context
import Gargantext.Text.Terms.Mono (monoTextsBySentence)
......@@ -29,21 +28,20 @@ import Prelude (error)
import Gargantext.Prelude
import Data.List (null, concatMap)
import Data.Ord
import qualified Data.Set as Set
------------------------------------------------------------------------
data Pattern = Pattern
{ _pat_table :: !(KMP.Table Term)
{ _pat_table :: !(KMP.Table Text)
, _pat_length :: !Int
, _pat_terms :: !Terms
, _pat_terms :: ![Text]
}
type Patterns = [Pattern]
------------------------------------------------------------------------
replaceTerms :: Patterns -> Sentence Term -> Sentence Terms
replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0
where
terms_len = length terms
......@@ -72,8 +70,8 @@ buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
f alt | "" `elem` alt = error "buildPatterns: ERR1"
| null alt = error "buildPatterns: ERR2"
| otherwise =
Pattern (KMP.build alt) (length alt)
(Terms label $ Set.empty) -- TODO check stems
Pattern (KMP.build alt) (length alt) label
--(Terms label $ Set.empty) -- TODO check stems
extractTermsWithList :: Patterns -> Text -> Corpus Terms
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment