Commit 4bf4c2d6 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[FEAT] New ngrams list + fix isidore api.

parent 7e77066f
......@@ -85,13 +85,19 @@ groupNodesByNgrams syn occs = Map.fromListWith (<>) occs'
data Diagonal = Diagonal Bool
getCoocByNgrams :: Diagonal -> Map Text (Set NodeId) -> Map (Text, Text) Int
getCoocByNgrams (Diagonal diag) m =
getCoocByNgrams = getCoocByNgrams' identity
getCoocByNgrams' :: (Ord a, Ord c) => (b -> Set c) -> Diagonal -> Map a b -> Map (a, a) Int
getCoocByNgrams' f (Diagonal diag) m =
Map.fromList [((t1,t2)
,maybe 0 Set.size $ Set.intersection
<$> Map.lookup t1 m
<*> Map.lookup t2 m
<$> (fmap f $ Map.lookup t1 m)
<*> (fmap f $ Map.lookup t2 m)
) | (t1,t2) <- case diag of
True -> [ (x,y) | x <- Map.keys m, y <- Map.keys m, x <= y]
False -> listToCombi identity (Map.keys m)
]
......@@ -31,16 +31,16 @@ import Gargantext.Database.Types.Node (ListId, CorpusId{-, HyperdataCorpus-})
--import Gargantext.Database.Flow (getOrMkRootWithCorpus)
import Gargantext.Database.Config (userMaster)
import Gargantext.Prelude
import Gargantext.Text.Metrics (scored, Scored(..), localMetrics{-, toScored-})
import Gargantext.Text.Metrics (scored, Scored(..), {-localMetrics, toScored-})
import qualified Data.Map as Map
import qualified Data.Vector.Storable as Vec
--import qualified Data.Vector.Storable as Vec
getMetrics :: FlowCmdM env err m
=> CorpusId -> Maybe ListId -> TabType -> Maybe Limit
-> m (Map Text (ListType, Maybe Text), [Scored Text])
getMetrics cId maybeListId tabType maybeLimit = do
(ngs, _, myCooc) <- getNgramsCooc cId maybeListId tabType maybeLimit
(ngs, _, myCooc) <- getNgramsCooc cId maybeListId tabType maybeLimit
pure (ngs, scored myCooc)
......@@ -59,7 +59,6 @@ getMetrics cId maybeListId tabType maybeLimit = do
metrics' <- getTficfWith cId masterCorpusId (lIds <> [lId]) (ngramsTypeFromTabType tabType) ngs'
pure (ngs , toScored [metrics, Map.fromList $ map (\(a,b) -> (a, Vec.fromList [fst b])) $ Map.toList metrics'])
-}
getLocalMetrics :: (FlowCmdM env err m)
=> CorpusId -> Maybe ListId -> TabType -> Maybe Limit
......@@ -70,6 +69,7 @@ getLocalMetrics :: (FlowCmdM env err m)
getLocalMetrics cId maybeListId tabType maybeLimit = do
(ngs, ngs', myCooc) <- getNgramsCooc cId maybeListId tabType maybeLimit
pure (ngs, ngs', localMetrics myCooc)
-}
getNgramsCooc :: (FlowCmdM env err m)
......
......@@ -16,16 +16,20 @@ Portability : POSIX
module Gargantext.Text.List
where
import Data.Either (partitionEithers, Either(..))
import Debug.Trace (trace)
import Data.Map (Map)
import Data.Set (Set)
import Data.Text (Text)
import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Database.Utils (Cmd)
import Gargantext.Text.List.Learn (Model(..))
import Gargantext.Text.Metrics (takeScored)
import Gargantext.Prelude
--import Gargantext.Text.Terms (TermType(..))
import qualified Data.Char as Char
......@@ -41,6 +45,13 @@ data NgramsListBuilder = BuilderStepO { stemSize :: Int
}
| BuilderStep1 { withModel :: Model }
| BuilderStepN { withModel :: Model }
| Tficf { nlb_lang :: Lang
, nlb_group1 :: Int
, nlb_group2 :: Int
, nlb_stopSize :: StopSize
, nlb_userCorpusId :: UserCorpusId
, nlb_masterCorpusId :: MasterCorpusId
}
data StopSize = StopSize {unStopSize :: Int}
......@@ -50,6 +61,7 @@ buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorp
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsLists l n m s uCid mCid = do
ngTerms <- buildNgramsTermsList l n m s uCid mCid
--ngTerms <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 550 300
othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
pure $ Map.unions $ othersTerms <> [ngTerms]
......@@ -69,11 +81,54 @@ buildNgramsOthersList uCid groupIt nt = do
)
]
--{-
buildNgramsTermsList' :: UserCorpusId
-> (Text -> Text)
-> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int
-> Cmd err (Map NgramsType [NgramsElement])
--}
buildNgramsTermsList' uCid groupIt stop gls is = do
ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
let
(stops, candidates) = partitionEithers
$ map (\t -> if stop t then Left t else Right t)
$ Map.toList
$ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
(maps, candidates') = takeScored gls is
$ getCoocByNgrams' snd (Diagonal True)
$ Map.fromList candidates
toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
(s,c,m) = (stops
, List.filter (\(k,_) -> List.elem k candidates') candidates
, List.filter (\(k,_) -> List.elem k maps) candidates
)
let ngs' = List.concat
$ map toNgramsElement
$ map (\t -> (StopTerm, toList' t)) s
<> map (\t -> (CandidateTerm, toList' t)) c
<> map (\t -> (GraphTerm, toList' t)) m
pure $ Map.fromList [(NgramsTerms, ngs')]
buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsTermsList l n m s uCid mCid = do
candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
let termList = toTermList ((isStopTerm s) . fst) candidates
let
candidatesSize = 2000
a = 500
b = 500
candidatesHead = List.take candidatesSize candidates
candidatesTail = List.drop candidatesSize candidates
termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
<> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
let ngs = List.concat $ map toNgramsElement termList
pure $ Map.fromList [(NgramsTerms, ngs)]
......@@ -94,24 +149,26 @@ toNgramsElement (listType, (_stem, (_score, setNgrams))) =
(mSetFromList [])
) children
-- TODO remove hard coded parameters
toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
toTermList stop ns = map (toTermList' stop CandidateTerm) xs
<> map (toTermList' stop GraphTerm) ys
<> map (toTermList' stop CandidateTerm) zs
where
toTermList' stop' l n = case stop' n of
True -> (StopTerm, n)
False -> (l, n)
-- TODO use % of size of list
-- TODO user ML
toList :: (b -> Bool) -> ListType -> b -> (ListType, b)
toList stop l n = case stop n of
True -> (StopTerm, n)
False -> (l, n)
toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)]
toTermList _ _ _ [] = []
toTermList a b stop ns = trace ("computing toTermList") $
map (toList stop CandidateTerm) xs
<> map (toList stop GraphTerm) ys
<> toTermList a b stop zs
where
xs = take a ns
ys = take b $ drop a ns
zs = drop b $ drop a ns
ta = drop a ns
ys = take b ta
zs = drop b ta
a = 300
b = 350
isStopTerm :: StopSize -> Text -> Bool
isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
......
......@@ -22,7 +22,7 @@ module Gargantext.Text.Metrics
--import Math.KMeans (kmeans, euclidSq, elements)
--import GHC.Float (exp)
import Data.Tuple.Extra (both)
import Data.Map (Map)
import Data.List.Extra (sortOn)
import GHC.Real (round)
......@@ -40,21 +40,22 @@ import qualified Data.Vector.Storable as Vec
type GraphListSize = Int
type InclusionSize = Int
toScored :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t]
toScored = map2scored
{-
toScored' :: Ord t => [Map t (Vec.Vector Double)] -> [Scored t]
toScored' = map2scored
. (pcaReduceTo (Dimension 2))
. (Map.filter (\v -> Vec.length v > 1))
. (Map.unionsWith (<>))
-}
scored :: Ord t => Map (t,t) Int -> [Scored t]
scored = map2scored . (pcaReduceTo (Dimension 2)) . scored2map
where
scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m
scored2map :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
scored2map m = Map.fromList $ map (\(Scored t i s) -> (t, Vec.fromList [i,s])) $ scored' m
map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
map2scored :: Ord t => Map t (Vec.Vector Double) -> [Scored t]
map2scored = map (\(t, ds) -> Scored t (Vec.head ds) (Vec.last ds)) . Map.toList
-- TODO change type with (x,y)
data Scored ts = Scored
......@@ -63,8 +64,8 @@ data Scored ts = Scored
, _scored_speGen :: !SpecificityGenericity
} deriving (Show)
localMetrics :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
localMetrics m = Map.fromList $ zipWith (\(_,t) (inc,spe) -> (t, Vec.fromList [inc,spe]))
localMetrics' :: Ord t => Map (t,t) Int -> Map t (Vec.Vector Double)
localMetrics' m = Map.fromList $ zipWith (\(_,t) (inc,spe) -> (t, Vec.fromList [inc,spe]))
(Map.toList fi)
scores
where
......@@ -88,8 +89,8 @@ scored' m = zipWith (\(_,t) (inc,spe) -> Scored t (inc) (spe)) (Map.toList fi) s
$ DAA.zip (DAA.use is) (DAA.use ss)
takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> [t]
takeScored listSize incSize = map _scored_terms
takeScored :: Ord t => GraphListSize -> InclusionSize -> Map (t,t) Int -> ([t],[t])
takeScored listSize incSize = both (map _scored_terms)
. linearTakes listSize incSize _scored_speGen
_scored_incExc
. scored
......@@ -100,8 +101,8 @@ takeScored listSize incSize = map _scored_terms
-- [(3,8),(6,5)]
linearTakes :: (Ord b1, Ord b2)
=> GraphListSize -> InclusionSize
-> (a -> b2) -> (a -> b1) -> [a] -> [a]
linearTakes gls incSize speGen incExc = take gls
-> (a -> b2) -> (a -> b1) -> [a] -> ([a],[a])
linearTakes gls incSize speGen incExc = (List.splitAt gls)
. List.concat
. map (take $ round
$ (fromIntegral gls :: Double)
......
......@@ -78,8 +78,8 @@ isidoreToDoc l (IsidoreDoc t a d u s as) = do
Nothing
Nothing
(Just $ cleanText $ langText t)
Nothing
(creator2text <$> as)
Nothing
(_sourceName <$> s)
(cleanText <$> langText <$> a)
(fmap (Text.pack . show) utcTime)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment