Commit eeeec827 authored by Alexandre Delanoë's avatar Alexandre Delanoë

[TextFlow] ngrams size (WIP)

parent 80c2bb9e
Pipeline #1073 failed with stage
......@@ -115,7 +115,7 @@ import Data.Set (Set)
import qualified Data.Set as S
import qualified Data.Set as Set
import Data.Swagger hiding (version, patch)
import Data.Text (Text, count, isInfixOf, unpack)
import Data.Text (Text, isInfixOf, unpack)
import Data.Text.Lazy.IO as DTL
import Data.Validity
import Database.PostgreSQL.Simple.FromField (FromField, fromField)
......@@ -144,6 +144,7 @@ import Gargantext.Database.Query.Table.Node.Error (HasNodeError)
import Gargantext.Database.Admin.Types.Node (NodeType(..))
import Gargantext.Database.Prelude (fromField', HasConnectionPool, HasConfig)
import qualified Gargantext.Database.Query.Table.Ngrams as TableNgrams
import qualified Gargantext.Core.Text as GCT
------------------------------------------------------------------------
--data FacetFormat = Table | Chart
......@@ -262,10 +263,7 @@ mkNgramsElement :: NgramsTerm
-> MSet NgramsTerm
-> NgramsElement
mkNgramsElement ngrams list rp children =
NgramsElement ngrams size list 1 (_rp_root <$> rp) (_rp_parent <$> rp) children
where
-- TODO review
size = 1 + count " " ngrams
NgramsElement ngrams (GCT.size ngrams) list 1 (_rp_root <$> rp) (_rp_parent <$> rp) children
newNgramsElement :: Maybe ListType -> NgramsTerm -> NgramsElement
newNgramsElement mayList ngrams =
......
......@@ -127,3 +127,9 @@ termTests = "It is hard to detect important articles in a specific context. Info
-- group ngrams
--ocs = occ $ ws
-- | Ngrams size
size :: Text -> Int
size t = 1 + DT.count " " t
......@@ -27,6 +27,7 @@ import Gargantext.Core.Text.Metrics.TFICF (sortTficf)
import Gargantext.Database.Prelude (Cmd)
import Gargantext.Database.Schema.Ngrams (NgramsType(..))
import Gargantext.Prelude
import Gargantext.Core.Text (size)
import Gargantext.Core.Text.List.Learn (Model(..))
-- import Gargantext.Core.Text.Metrics (takeScored)
import qualified Data.Char as Char
......@@ -63,7 +64,8 @@ buildNgramsLists :: Lang
-> Cmd err (Map NgramsType [NgramsElement])
buildNgramsLists l n m s uCid mCid = do
ngTerms <- buildNgramsTermsList l n m s uCid mCid
othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
othersTerms <- mapM (buildNgramsOthersList uCid identity)
[Authors, Sources, Institutes]
pure $ Map.unions $ othersTerms <> [ngTerms]
......@@ -76,7 +78,9 @@ buildNgramsOthersList uCid groupIt nt = do
let
listSize = 9
all' = List.reverse $ List.sortOn (Set.size . snd . snd) $ Map.toList ngs
all' = List.reverse
$ List.sortOn (Set.size . snd . snd)
$ Map.toList ngs
graphTerms = List.take listSize all'
candiTerms = List.drop listSize all'
......@@ -104,12 +108,16 @@ buildNgramsTermsList l n m s uCid mCid = do
-- printDebug "tail candidates" (List.take 10 $ List.reverse $ candidates)
let
listSize = 400 :: Double
(candidatesHead, candidatesTail0) = List.splitAt 3 candidates
(candidatesMap, candidatesTailFinal) = List.splitAt 400 candidatesTail0
(mono, multi) = List.partition (\t -> (size . fst) t < 2) candidatesTail0
(monoHead , monoTail ) = List.splitAt (round $ 0.60 * listSize) mono
(multiHead, multiTail) = List.splitAt (round $ 0.40 * listSize) multi
termList = (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesHead)
<> (map (toGargList ((isStopTerm s) . fst) MapTerm) candidatesMap)
<> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) candidatesTailFinal)
<> (map (toGargList ((isStopTerm s) . fst) MapTerm) (monoHead <> multiHead))
<> (map (toGargList ((isStopTerm s) . fst) CandidateTerm) (monoTail <> multiTail))
ngs = List.concat
$ map toNgramsElement
......
......@@ -52,12 +52,12 @@ dicoStruct dict_occ = do
heterogeinity :: [Char] -> IO Integer
heterogeinity string = do
let dict_occ = occurrences $ cleanText string
let keys_size = toInteger $ length $ M.keys dict_occ
let total_occ = sum $ Prelude.map (\(x, y) -> y) $ M.toList dict_occ
return $ div total_occ (fromIntegral keys_size)
--computeHeterogeinity
-- :: Fractional t =>
......
......@@ -57,6 +57,7 @@ import qualified Data.List as List
import qualified Data.Set as Set
import qualified Data.Text as Text
data TermType lang
= Mono { _tt_lang :: !lang }
| Multi { _tt_lang :: !lang }
......
......@@ -466,19 +466,19 @@ incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
-- | Inclusion (i) = Gen(i)+Spec(i)
inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
-- | Genericity score = Gen(i)- Spec(i)
specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
-- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
pV :: Acc (Matrix Double) -> Acc (Vector Double)
pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
-- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
pH :: Acc (Matrix Double) -> Acc (Vector Double)
pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
cardN :: Exp Double
cardN = constant (P.fromIntegral (dim m) :: Double)
......
......@@ -236,7 +236,7 @@ queryNgramsOccurrencesOnlyByNodeUser' = [sql|
|]
------------------------------------------------------------------------
getNodesByNgramsOnlyUser :: NodeId
getNodesByNgramsOnlyUser :: CorpusId
-> [ListId]
-> NgramsType
-> [Text]
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment