Commit eeeec827 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TextFlow] ngrams size (WIP)

parent 80c2bb9e
Pipeline #1073 failed with stage
...@@ -115,7 +115,7 @@ import Data.Set (Set) ...@@ -115,7 +115,7 @@ import Data.Set (Set)
import qualified Data.Set as S import qualified Data.Set as S
import qualified Data.Set as Set import qualified Data.Set as Set
import Data.Swagger hiding (version, patch) import Data.Swagger hiding (version, patch)
import Data.Text (Text, count, isInfixOf, unpack) import Data.Text (Text, isInfixOf, unpack)
import Data.Text.Lazy.IO as DTL import Data.Text.Lazy.IO as DTL
import Data.Validity import Data.Validity
import Database.PostgreSQL.Simple.FromField (FromField, fromField) import Database.PostgreSQL.Simple.FromField (FromField, fromField)
...@@ -144,6 +144,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError) ...@@ -144,6 +144,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Admin.Types.Node (NodeType(..)) import Gargantext.Database.Admin.Types.Node (NodeType(..))
import Gargantext.Database.Prelude (fromField', HasConnectionPool, HasConfig) import Gargantext.Database.Prelude (fromField', HasConnectionPool, HasConfig)
import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
import qualified Gargantext.Core.Text as GCT
------------------------------------------------------------------------ ------------------------------------------------------------------------
--data FacetFormat = Table | Chart --data FacetFormat = Table | Chart
...@@ -262,10 +263,7 @@ mkNgramsElement :: NgramsTerm ...@@ -262,10 +263,7 @@ mkNgramsElement :: NgramsTerm
-> MSet NgramsTerm -> MSet NgramsTerm
-> NgramsElement -> NgramsElement
mkNgramsElement ngrams list rp children = mkNgramsElement ngrams list rp children =
NgramsElement ngrams size list 1 (_rp_root <$> rp) (_rp_parent <$> rp) children NgramsElement ngrams (GCT.size ngrams) list 1 (_rp_root <$> rp) (_rp_parent <$> rp) children
where
-- TODO review
size = 1 + count " " ngrams
newNgramsElement :: Maybe ListType -> NgramsTerm -> NgramsElement newNgramsElement :: Maybe ListType -> NgramsTerm -> NgramsElement
newNgramsElement mayList ngrams = newNgramsElement mayList ngrams =
......
...@@ -127,3 +127,9 @@ termTests = "It is hard to detect important articles in a specific context. Info ...@@ -127,3 +127,9 @@ termTests = "It is hard to detect important articles in a specific context. Info
-- group ngrams -- group ngrams
--ocs = occ $ ws --ocs = occ $ ws
-- | Ngrams size
size :: Text -> Int
size t = 1 + DT.count " " t
...@@ -27,6 +27,7 @@ import Gargantext.Core.Text.Metrics.TFICF (sortTficf) ...@@ -27,6 +27,7 @@ import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
import Gargantext.Database.Prelude (Cmd) import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Schema.Ngrams (NgramsType(..)) import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude import Gargantext.Prelude
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Learn (Model(..)) import Gargantext.Core.Text.List.Learn (Model(..))
-- import Gargantext.Core.Text.Metrics (takeScored) -- import Gargantext.Core.Text.Metrics (takeScored)
import qualified Data.Char as Char import qualified Data.Char as Char
...@@ -63,7 +64,8 @@ buildNgramsLists :: Lang ...@@ -63,7 +64,8 @@ buildNgramsLists :: Lang
-> Cmd err (Map NgramsType [NgramsElement]) -> Cmd err (Map NgramsType [NgramsElement])
buildNgramsLists l n m s uCid mCid = do buildNgramsLists l n m s uCid mCid = do
ngTerms <- buildNgramsTermsList l n m s uCid mCid ngTerms <- buildNgramsTermsList l n m s uCid mCid
othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes] othersTerms <- mapM (buildNgramsOthersList uCid identity)
[Authors, Sources, Institutes]
pure $ Map.unions $ othersTerms <> [ngTerms] pure $ Map.unions $ othersTerms <> [ngTerms]
...@@ -76,7 +78,9 @@ buildNgramsOthersList uCid groupIt nt = do ...@@ -76,7 +78,9 @@ buildNgramsOthersList uCid groupIt nt = do
let let
listSize = 9 listSize = 9
all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs all' = List.reverse
$ List.sortOn (Set.size . snd . snd)
$ Map.toList ngs
graphTerms = List.take listSize all' graphTerms = List.take listSize all'
candiTerms = List.drop listSize all' candiTerms = List.drop listSize all'
...@@ -104,12 +108,16 @@ buildNgramsTermsList l n m s uCid mCid = do ...@@ -104,12 +108,16 @@ buildNgramsTermsList l n m s uCid mCid = do
-- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates) -- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
let let
listSize = 400 :: Double
(candidatesHead, candidatesTail0) = List.splitAt 3 candidates (candidatesHead, candidatesTail0) = List.splitAt 3 candidates
(candidatesMap, candidatesTailFinal) = List.splitAt 400 candidatesTail0
(mono, multi) = List.partition (\t -> (size . fst) t < 2) candidatesTail0
(monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSize) mono
(multiHead, multiTail) = List.splitAt (round $ 0.40 * listSize) multi
termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead) termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
<> (map (toGargList ((isStopTerm s) . fst) MapTerm) candidatesMap) <> (map (toGargList ((isStopTerm s) . fst) MapTerm) (monoHead <> multiHead))
<> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesTailFinal) <> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) (monoTail <> multiTail))
ngs = List.concat ngs = List.concat
$ map toNgramsElement $ map toNgramsElement
......
...@@ -52,12 +52,12 @@ dicoStruct dict_occ = do ...@@ -52,12 +52,12 @@ dicoStruct dict_occ = do
heterogeinity :: [Char] -> IO Integer heterogeinity :: [Char] -> IO Integer
heterogeinity string = do heterogeinity string = do
let dict_occ = occurrences $ cleanText string let dict_occ = occurrences $ cleanText string
let keys_size = toInteger $ length $ M.keys dict_occ let keys_size = toInteger $ length $ M.keys dict_occ
let total_occ = sum $ Prelude.map (\(x, y) -> y) $ M.toList dict_occ let total_occ = sum $ Prelude.map (\(x, y) -> y) $ M.toList dict_occ
return $ div total_occ (fromIntegral keys_size) return $ div total_occ (fromIntegral keys_size)
--computeHeterogeinity --computeHeterogeinity
-- :: Fractional t => -- :: Fractional t =>
......
...@@ -57,6 +57,7 @@ import qualified Data.List as List ...@@ -57,6 +57,7 @@ import qualified Data.List as List
import qualified Data.Set as Set import qualified Data.Set as Set
import qualified Data.Text as Text import qualified Data.Text as Text
data TermType lang data TermType lang
= Mono { _tt_lang :: !lang } = Mono { _tt_lang :: !lang }
| Multi { _tt_lang :: !lang } | Multi { _tt_lang :: !lang }
......
...@@ -466,19 +466,19 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m) ...@@ -466,19 +466,19 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
-- | Inclusion (i) = Gen(i)+Spec(i) -- | Inclusion (i) = Gen(i)+Spec(i)
inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double) inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
inclusionExclusion mat = zipWith (+) (pV mat) (pV mat) inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
-- | Genericity score = Gen(i)- Spec(i) -- | Genericity score = Gen(i)- Spec(i)
specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double) specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
specificityGenericity mat = zipWith (+) (pH mat) (pH mat) specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
-- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
pV :: Acc (Matrix Double) -> Acc (Vector Double) pV :: Acc (Matrix Double) -> Acc (Vector Double)
pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
-- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
pH :: Acc (Matrix Double) -> Acc (Vector Double) pH :: Acc (Matrix Double) -> Acc (Vector Double)
pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
cardN :: Exp Double cardN :: Exp Double
cardN = constant (P.fromIntegral (dim m) :: Double) cardN = constant (P.fromIntegral (dim m) :: Double)
......
...@@ -236,7 +236,7 @@ queryNgramsOccurrencesOnlyByNodeUser' = [sql| ...@@ -236,7 +236,7 @@ queryNgramsOccurrencesOnlyByNodeUser' = [sql|
|] |]
------------------------------------------------------------------------ ------------------------------------------------------------------------
getNodesByNgramsOnlyUser :: NodeId getNodesByNgramsOnlyUser :: CorpusId
-> [ListId] -> [ListId]
-> NgramsType -> NgramsType
-> [Text] -> [Text]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment