Commit 3622a5eb authored by Alexandre Delanoë's avatar Alexandre Delanoë

Merge branch 'dev' into dev-phylo

parents 13c81a8c 3b8711b9
......@@ -58,7 +58,7 @@ import GHC.Generics (Generic)
import Gargantext.Core.Utils.Prefix (unPrefix)
import Gargantext.Database.Types.Node (NodeType(..))
import Gargantext.Database.Schema.Node (defaultList, HasNodeError)
import Gargantext.Database.Schema.Ngrams (NgramsType, NgramsTypeId, ngramsTypeId, NgramsTableData'(..))
import Gargantext.Database.Schema.Ngrams (NgramsType, NgramsTypeId, ngramsTypeId, NgramsTableData(..))
import qualified Gargantext.Database.Schema.Ngrams as Ngrams
import Gargantext.Database.Schema.NodeNgram
import Gargantext.Database.Utils (Cmd)
......@@ -120,10 +120,10 @@ newtype NgramsTable = NgramsTable { _ngramsTable :: [NgramsElement] }
deriving (Ord, Eq, Generic, ToJSON, FromJSON, Show)
-- | TODO Check N and Weight
toNgramsElement :: [NgramsTableData'] -> [NgramsElement]
toNgramsElement :: [NgramsTableData] -> [NgramsElement]
toNgramsElement ns = map toNgramsElement' ns
where
toNgramsElement' (NgramsTableData' _ p t _ lt w) = NgramsElement t lt' (round w) p' c'
toNgramsElement' (NgramsTableData _ p t _ lt w) = NgramsElement t lt' (round w) p' c'
where
p' = case p of
Nothing -> Nothing
......@@ -132,14 +132,14 @@ toNgramsElement ns = map toNgramsElement' ns
lt' = maybe (panic "API.Ngrams: listypeId") identity lt
mapParent :: Map Int Text
mapParent = fromListWith (<>) $ map (\(NgramsTableData' i _ t _ _ _) -> (i,t)) ns
mapParent = fromListWith (<>) $ map (\(NgramsTableData i _ t _ _ _) -> (i,t)) ns
mapChildren :: Map Text (Set Text)
mapChildren = mapKeys (\i -> (maybe (panic "API.Ngrams.mapChildren: ParentId with no Terms: Impossible") identity $ lookup i mapParent))
$ fromListWith (<>)
$ map (first fromJust)
$ filter (isJust . fst)
$ map (\(NgramsTableData' _ p t _ _ _) -> (p, Set.singleton t)) ns
$ map (\(NgramsTableData _ p t _ _ _) -> (p, Set.singleton t)) ns
instance Arbitrary NgramsTable where
......
......@@ -18,3 +18,6 @@ module Gargantext.Core.Types.Individu
import Data.Text (Text)
type Username = Text
type UsernameMaster = Username
type UsernameSimple = Username
......@@ -33,7 +33,9 @@ import Gargantext.Core.Types.Main
import Gargantext.Core (Lang(..))
import Gargantext.Database.Config (userMaster, userArbitrary, corpusMasterName)
import Gargantext.Database.Flow.Utils (insertToNodeNgrams)
import Gargantext.Database.Metrics.TFICF (getTficf)
import Gargantext.Text.Terms (extractTerms)
import Gargantext.Text.Metrics.TFICF (Tficf(..))
import Gargantext.Database.Node.Document.Add (add)
import Gargantext.Database.Node.Document.Insert (insertDocuments, ReturnId(..), addUniqIdsDoc, addUniqIdsContact, ToDbData(..))
import Gargantext.Database.Root (getRoot)
......@@ -103,7 +105,7 @@ flowCorpus' :: HasNodeError err
flowCorpus' NodeCorpus hyperdataDocuments (ids,masterUserId,masterCorpusId, userId,userCorpusId) = do
--------------------------------------------------
-- List Ngrams Flow
userListId <- flowListUser userId userCorpusId
userListId <- flowListUser userId userCorpusId 300
printDebug "Working on User ListId : " userListId
let documentsWithId = mergeData (toInserted ids) (toInsert hyperdataDocuments)
......@@ -280,8 +282,15 @@ flowList uId cId ngs = do
pure lId
flowListUser :: HasNodeError err => UserId -> CorpusId -> Cmd err NodeId
flowListUser uId cId = getOrMkList cId uId
flowListUser :: HasNodeError err => UserId -> CorpusId -> Int -> Cmd err NodeId
flowListUser uId cId n = do
lId <- getOrMkList cId uId
-- is <- insertLists lId $ ngrams2list ngs
ngs <- take n <$> sortWith tficf_score <$> getTficf userMaster cId lId NgramsTerms
_ <- insertNodeNgrams [ NodeNgram lId (tficf_ngramsId ng) Nothing (ngramsTypeId NgramsTerms) (fromIntegral $ listTypeId GraphList) 1 | ng <- ngs]
pure lId
------------------------------------------------------------------------
......@@ -319,3 +328,4 @@ insertLists lId lngs = insertNodeNgrams [ NodeNgram lId (_ngramsId ng) Nothing (
| (l,(ngt, ng)) <- lngs
]
------------------------------------------------------------------------
{-|
Module : Gargantext.Database.Metrics.TFICF
Description : Ngram connection to the Database
Copyright : (c) CNRS, 2017-Present
License : AGPL + CECILL v3
Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
TFICF, generalization of TFIDF
-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}
module Gargantext.Database.Metrics.TFICF where
import Database.PostgreSQL.Simple.SqlQQ (sql)
import qualified Database.PostgreSQL.Simple as DPS
import Safe (headMay)
import Gargantext.Text.Metrics.TFICF -- (tficf)
import Gargantext.Prelude
import Gargantext.Core.Types.Individu (UsernameMaster)
import Gargantext.Database.Utils (Cmd, runPGSQuery)
import Gargantext.Database.Types.Node (ListId, CorpusId, NodeType(..))
import Gargantext.Database.Config (nodeTypeId)
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms, NgramsType, ngramsTypeId)
type OccGlobal = Double
type OccCorpus = Double
getTficf :: UsernameMaster -> CorpusId -> ListId -> NgramsType
-> Cmd err [Tficf]
getTficf u cId lId ngType = do
g <- getTficfGlobal u
c <- getTficfCorpus cId
ngs <- getTficfNgrams u cId lId ngType
pure $ map (\(nId, nTerms, wm, wn)
-> Tficf nId nTerms
(tficf (TficfCorpus wn (fromIntegral c))
(TficfLanguage wm (fromIntegral g))
)
) ngs
getTficfGlobal :: UsernameMaster -> Cmd err Int
getTficfGlobal u = maybe 0 identity <$> headMay
<$> map (\(DPS.Only n) -> n )
<$> runPGSQuery q p
where
p = (u, nodeTypeId NodeDocument)
q = [sql| SELECT count(*) from nodes n
JOIN auth_user a ON a.id = n.user_id
WHERE
a.username = ?
AND n.typename = ?
|]
getTficfCorpus :: CorpusId -> Cmd err Int
getTficfCorpus cId = maybe 0 identity <$> headMay
<$> map (\(DPS.Only n) -> n )
<$> runPGSQuery q p
where
p = (cId, nodeTypeId NodeDocument)
q = [sql| WITH input(corpusId, typename) AS ((VALUES(?::"int4",?::"int4")))
SELECT count(*) from nodes_nodes AS nn
JOIN nodes AS n ON n.id = nn.node2_id
JOIN input ON nn.node1_id = input.corpusId
WHERE n.typename = input.typename;
|]
getTficfNgrams :: UsernameMaster -> CorpusId -> ListId -> NgramsType
-> Cmd err [(NgramsId, NgramsTerms, OccGlobal, OccCorpus)]
getTficfNgrams u cId lId ngType = runPGSQuery queryTficf p
where
p = (u, nodeTypeId NodeList, nodeTypeId NodeDocument, ngramsTypeId ngType, cId, lId)
queryTficf :: DPS.Query
queryTficf = [sql|
-- TODO add CTE for groups
WITH input(masterUsername,typenameList,typenameDoc,ngramsTypeId,corpusId,listId)
AS ((VALUES(?::"text", ? :: "int4", ?::"int4", ?::"int4",?::"int4",?::"int4"))),
-- AS ((VALUES('gargantua'::"text", 5 :: "int4", 4::"int4", 4::"int4",1018::"int4",1019::"int4"))),
list_master AS (
SELECT n.id,n.name,n.user_id from nodes n
JOIN input ON n.typename = input.typenameList
JOIN auth_user a ON a.id = n.user_id
WHERE
a.username = input.masterUsername
),
ngrams_master AS (
SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight FROM nodes_ngrams nng
JOIN list_master ON list_master.id = nng.node_id
JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
JOIN nodes n ON n.id = nng2.node_id
JOIN input ON input.typenameDoc = n.typename
JOIN ngrams ng ON ng.id = nng2.ngrams_id
WHERE
nng.ngrams_type = input.ngramsTypeId
-- AND n.hyperdata -> 'lang' = 'en'
GROUP BY ng.id,ng.terms
),
ngrams_user AS (
SELECT ng.id, ng.terms, SUM(nng2.weight) AS weight
FROM nodes_ngrams nng
JOIN list_master ON list_master.id = nng.node_id
JOIN nodes_ngrams nng2 ON nng2.ngrams_id = nng.ngrams_id
JOIN nodes_nodes nn ON nn.node2_id = nng2.node_id
JOIN ngrams ng ON ng.id = nng2.ngrams_id
JOIN input ON nn.node1_id = input.corpusId
WHERE
nng.ngrams_type = input.ngramsTypeId
-- AND n.hyperdata -> 'lang' = 'en'
GROUP BY ng.id,ng.terms
)
SELECT nu.id,nu.terms,SUM(nm.weight) wm,SUM(nu.weight) wu
FROM ngrams_user nu
JOIN ngrams_master nm ON nm.id = nu.id
WHERE
nm.weight > 1
AND
nu.weight > 1
GROUP BY nu.id,nu.terms
--ORDER BY wm DESC
--LIMIT 1000
|]
......@@ -53,6 +53,11 @@ import Prelude (Enum, Bounded, minBound, maxBound, Functor)
import qualified Data.Set as DS
import qualified Database.PostgreSQL.Simple as PGS
type NgramsTerms = Text
type NgramsId = Int
type Size = Int
--{-
data NgramsPoly id terms n = NgramsDb { ngrams_id :: id
, ngrams_terms :: terms
......@@ -128,10 +133,6 @@ ngramsTypeId NgramsTerms = 4
fromNgramsTypeId :: NgramsTypeId -> Maybe NgramsType
fromNgramsTypeId id = lookup id $ fromList [(ngramsTypeId nt,nt) | nt <- [minBound .. maxBound] :: [NgramsType]]
type NgramsTerms = Text
type NgramsId = Int
type Size = Int
------------------------------------------------------------------------
-- | TODO put it in Gargantext.Text.Ngrams
data Ngrams = Ngrams { _ngramsTerms :: Text
......@@ -232,7 +233,7 @@ queryInsertNgrams = [sql|
getNgramsTableDb :: NodeType -> NgramsType
-> NgramsTableParamUser
-> Limit -> Offset
-> Cmd err [NgramsTableData']
-> Cmd err [NgramsTableData]
getNgramsTableDb nt ngrt ntp limit_ offset_ = do
......@@ -245,7 +246,7 @@ getNgramsTableDb nt ngrt ntp limit_ offset_ = do
listMasterId <- maybe (panic "error master list") (view node_id) <$> head <$> getListsWithParentId corpusMasterId
getNgramsTableData' nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
getNgramsTableData nt ngrt ntp (NgramsTableParam listMasterId corpusMasterId) limit_ offset_
data NgramsTableParam =
NgramsTableParam { _nt_listId :: NodeId
......@@ -255,44 +256,24 @@ data NgramsTableParam =
type NgramsTableParamUser = NgramsTableParam
type NgramsTableParamMaster = NgramsTableParam
data NgramsTableData = NgramsTableData { _ntd_ngrams :: Text
, _ntd_n :: Int
, _ntd_listType :: Maybe ListType
, _ntd_weight :: Double
} deriving (Show)
getNgramsTableData :: NodeType -> NgramsType
-> NgramsTableParamUser -> NgramsTableParamMaster
-> Limit -> Offset
-> Cmd err [NgramsTableData]
getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
trace ("Ngrams table params" <> show params) <$>
map (\(t,n,nt,w) -> NgramsTableData t n (fromListTypeId nt) w) <$>
runPGSQuery querySelectTableNgrams params
where
nodeTId = nodeTypeId nodeT
ngrmTId = ngramsTypeId ngrmT
params = (ul,uc,nodeTId,ngrmTId,ml,mc,nodeTId,ngrmTId,uc) :.
(limit_, offset_)
data NgramsTableData' = NgramsTableData' { _ntd2_id :: Int
, _ntd2_parent_id :: Maybe Int
, _ntd2_terms :: Text
, _ntd2_n :: Int
, _ntd2_listType :: Maybe ListType
, _ntd2_weight :: Double
data NgramsTableData = NgramsTableData { _ntd_id :: Int
, _ntd_parent_id :: Maybe Int
, _ntd_terms :: Text
, _ntd_n :: Int
, _ntd_listType :: Maybe ListType
, _ntd_weight :: Double
} deriving (Show)
getNgramsTableData' :: NodeType -> NgramsType
getNgramsTableData :: NodeType -> NgramsType
-> NgramsTableParamUser -> NgramsTableParamMaster
-> Limit -> Offset
-> Cmd err [NgramsTableData']
getNgramsTableData' nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
-> Cmd err [NgramsTableData]
getNgramsTableData nodeT ngrmT (NgramsTableParam ul uc) (NgramsTableParam ml mc) limit_ offset_ =
trace ("Ngrams table params: " <> show params) <$>
map (\(i,p,t,n,lt,w) -> NgramsTableData' i p t n (fromListTypeId lt) w) <$>
map (\(i,p,t,n,lt,w) -> NgramsTableData i p t n (fromListTypeId lt) w) <$>
runPGSQuery querySelectTableNgramsTrees params
where
nodeTId = nodeTypeId nodeT
......
......@@ -527,7 +527,7 @@ mkRoot uname uId = case uId > 0 of
mkCorpus :: Maybe Name -> Maybe HyperdataCorpus -> ParentId -> UserId -> Cmd err [CorpusId]
mkCorpus n h p u = insertNodesR [nodeCorpusW n h p u]
getOrMkList :: HasNodeError err => ParentId -> UserId -> Cmd err NodeId
getOrMkList :: HasNodeError err => ParentId -> UserId -> Cmd err ListId
getOrMkList pId uId =
maybe (mkList' pId uId) (pure . view node_id) . headMay =<< getListsWithParentId pId
where
......
......@@ -23,8 +23,10 @@ import GHC.IO (FilePath)
import qualified Data.Text as T
import Data.Text.IO (readFile)
import Data.Map.Strict (Map)
import Data.Maybe (catMaybes)
import qualified Data.Set as DS
import Data.Text (Text)
import qualified Data.Array.Accelerate as A
import qualified Data.Map.Strict as M
......@@ -116,7 +118,12 @@ textFlow' termType contexts = do
let myCooc2 = M.filter (>0) myCooc1
printDebug "myCooc2 size" (M.size myCooc2)
printDebug "myCooc2" myCooc2
g <- cooc2graph myCooc2
pure g
-- TODO use Text only here instead of [Text]
cooc2graph :: (Map ([Text], [Text]) Int) -> IO Graph
cooc2graph myCooc = do
-- Filtering terms with inclusion/Exclusion and Specificity/Genericity scores
let myCooc3 = filterCooc ( FilterConfig (MapListSize 350 )
......@@ -124,7 +131,7 @@ textFlow' termType contexts = do
(SampleBins 10 )
(Clusters 3 )
(DefaultValue 0 )
) myCooc2
) myCooc
printDebug "myCooc3 size" $ M.size myCooc3
printDebug "myCooc3" myCooc3
......@@ -146,7 +153,7 @@ textFlow' termType contexts = do
--let distanceMat = distributional matCooc
printDebug "distanceMat shape" $ A.arrayShape distanceMat
printDebug "distanceMat" distanceMat
--
--let distanceMap = M.filter (>0) $ mat2map distanceMat
let distanceMap = M.map (\_ -> 1) $ M.filter (>0) $ mat2map distanceMat
printDebug "distanceMap size" $ M.size distanceMap
......
......@@ -7,38 +7,43 @@ Maintainer : team@gargantext.org
Stability : experimental
Portability : POSIX
Definition of TFICF
Definition of TFICF : Term Frequency - Inverse of Context Frequency
-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
module Gargantext.Text.Metrics.TFICF where
import GHC.Generics (Generic)
--import Data.Text (Text)
import Gargantext.Prelude
import Gargantext.Database.Schema.Ngrams (NgramsId, NgramsTerms)
import Data.Maybe (Maybe)
import Data.Text (Text)
import Text.Show (Show())
data TficfContext n m = TficfLanguage n m | TficfCorpus n m | TficfDocument n m
deriving (Show)
-- import Gargantext.Types
import Gargantext.Prelude
data Tficf = Tficf { tficf_ngramsId :: NgramsId
, tficf_ngramsTerms :: NgramsTerms
, tficf_score :: Double
} deriving (Show)
data Context = Corpus | Document
deriving (Show, Generic)
type SupraContext = TficfContext
type InfraContext = TficfContext
data TFICF = TFICF { _tficfTerms :: Text
, _tficfContext1 :: Context
, _tficfContext2 :: Context
, _tficfScore :: Maybe Double
} deriving (Show, Generic)
-- | TFICF is a generalization of TFIDF
-- https://en.wikipedia.org/wiki/Tf%E2%80%93idf
tficf :: InfraContext Double Double -> SupraContext Double Double -> Double
tficf (TficfCorpus c c') (TficfLanguage l l') = tficf' c c' l l'
tficf (TficfDocument d d')(TficfCorpus c c') = tficf' d d' c c'
tficf _ _ = panic "Not in definition"
tficf' :: Double -> Double -> Double -> Double -> Double
tficf' c c' l l'
| c <= c' && l < l' = (l/l') / log (c/c')
| otherwise = panic "Frequency impossible"
--tfidf :: Text -> TFICF
--tfidf txt = TFICF txt Document Corpus score
-- where
-- score = Nothing
tficf_example :: [(Double,Double,Double,Double)]
tficf_example = undefined
......@@ -193,7 +193,7 @@ graphV3ToGraph (GraphV3 links nodes) = Graph (map nodeV32node nodes) (zipWith li
nodeV32node :: NodeV3 -> Node
nodeV32node (NodeV3 no_id' (AttributesV3 cl') no_s' no_lb')
= Node no_s' Terms (cs $ show no_id') no_lb' (Attributes cl')
linkV32edge :: Int -> EdgeV3 -> Edge
linkV32edge n (EdgeV3 eo_s' eo_t' eo_w') = Edge (cs $ show eo_s') (cs $ show eo_t') ((T.read $ T.unpack eo_w') :: Double) (cs $ show n)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment