Commit dee88be8 authored by Romain Loth

first simple version of tfidf in ngram_scores

parent a65df75a
# Maps a list/score node typename to the in-memory container class used
# to load and save it.  (Diff residue removed: the pre-commit entry
# 'OCCURRENCES': WeightedList was replaced by WeightedContextIndex.)
LISTTYPES = {
    'STOPLIST'     : UnweightedList,
    'MAINLIST'     : UnweightedList,
    'MAPLIST'      : UnweightedList,
    'OCCURRENCES'  : WeightedContextIndex,
    'COOCCURRENCES': WeightedMatrix,
    'TFIDF-CORPUS' : WeightedContextIndex,
}
# Node typenames; a node's typename is stored as the *index* into this
# list, so order matters and None keeps index 0 unused.
# (Diff residue removed: the diff showed both the old unannotated and
# the new index-annotated entries; this is the post-commit list.)
NODETYPES = [
    None,
    # documents hierarchy
    'USER',           # 1
    'PROJECT',        # 2
    'CORPUS',         # 3
    'DOCUMENT',       # 4
    # lists
    'STOPLIST',       # 5
    'GROUPLIST',      # 6
    'MAINLIST',       # 7
    'MAPLIST',        # 8
    'COOCCURRENCES',  # 9
    # scores
    'OCCURRENCES',    # 10
    'SPECIFICITY',    # 11
    'CVALUE',         # 12
    'TFIDF-CORPUS',   # 13
    'TFIDF-GLOBAL',   # 14
]
......
......@@ -2,7 +2,7 @@
"""
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList']
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']
from gargantext.util.db import session, bulk_insert
......@@ -165,6 +165,22 @@ class Translations(_BaseClass):
)
class WeightedContextIndex(_BaseClass):
    """
    Tensor representing a contextual index or registry
    (matrix of weighted ngrams *per* doc *per* context).

    Example: tfidf by corpus.

    Associated model   : NodeNodeNgram
    Associated columns : node1_id | node2_id | ngram_id | score (float)
    """
    def __init__(self, source=None):
        # NOTE(review): `source` is accepted (presumably for signature
        # parity with the sibling list classes) but is currently ignored
        # — confirm this is intentional.
        # items: missing keys default to a 0.0 score
        self.items = defaultdict(float)
class WeightedMatrix(_BaseClass):
def __init__(self, source=None):
......
from .parsing import parse
from .parsing import parse
from .ngrams_extraction import extract_ngrams
from .ngram_scores import compute_occurrences_local, compute_tfidf_local
from .ngram_groups import compute_groups
from gargantext.util.db import session
from gargantext.models import Node
from .group import compute_groups
from gargantext.models import Node
def parse_extract(corpus):
# retrieve corpus from database from id
......@@ -23,5 +22,15 @@ def parse_extract(corpus):
print('CORPUS #%d: extracted ngrams' % (corpus.id))
# temporary ngram lists workflow
group_id = compute_groups(corpus)
print('CORPUS #%d: new grouplist = #%i' % (corpus.id, group_id))
# write occurrences to Node and NodeNodeNgram
occnd_id = compute_occurrences_local(corpus)
print('CORPUS #%d: new occs node #%i' % (corpus.id, occnd_id))
# write local tfidf to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: new localtfidf node #%i' % (corpus.id, ltfidf_id))
# write groups to Node and NodeNgramNgram
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: new grouplist node #%i' % (corpus.id, group_id))
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert
# £TODO
# from gargantext.util.lists import WeightedContextIndex
from gargantext.util.db import func # = sqlalchemy.func like sum() or count()
from math import log
def compute_occurrences_local(corpus):
    """
    Sums the occurrences of each ngram over the documents of `corpus`,
    records the totals as NodeNodeNgram rows under a new "OCCURRENCES"
    node, and returns that node's id.
    """
    # Scope of the counts: ids of every DOCUMENT node under this corpus.
    # (Slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()])
    doc_ids = (
        session.query(Node.id)
               .filter(Node.parent_id == corpus.id)
               .filter(Node.typename == "DOCUMENT")
               .subquery()
    )

    # One (ngram_id, summed_weight) pair per ngram occurring in those docs,
    # e.g. [(1970, 1.0), (2024, 2.0), (259, 2.0), ...]
    ngram_totals = (
        session.query(NodeNgram.ngram_id, func.sum(NodeNgram.weight))
               .filter(NodeNgram.node_id.in_(doc_ids))
               .group_by(NodeNgram.ngram_id)
               .all()
    )

    # Persist a fresh OCCURRENCES node attached to the corpus.
    occ_node = Node()
    occ_node.typename = "OCCURRENCES"
    occ_node.name = "occ_sums (in:%s)" % corpus.id
    occ_node.parent_id = corpus.id
    occ_node.user_id = corpus.user_id
    session.add(occ_node)
    session.commit()

    # Mirror the totals in NodeNodeNgram (could be NodeNgram, but this
    # keeps the storage shape consistent with the tfidf nodes).
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((occ_node.id, corpus.id, ngram_id, total)
         for ngram_id, total in ngram_totals)
    )
    return occ_node.id
def compute_tfidf_local(corpus):
    """
    Calculates tfidf within the current corpus:

        tfidf(ngram) = tf * log(total_docs / nd)

    where tf is the summed weight of the ngram over the corpus' documents
    and nd is the number of documents containing it.

    Creates a new "TFIDF-CORPUS" node under the corpus, stores one
    NodeNodeNgram row (tfidf_node, corpus, ngram, score) per ngram,
    and returns the new node's id.
    """
    # ?? FIXME could we keep the docids somehow from previous computations ??
    docids_subquery = (session
                        .query(Node.id)
                        .filter(Node.parent_id == corpus.id)
                        .filter(Node.typename == "DOCUMENT")
                        .subquery()
                       )
    total_docs = session.query(docids_subquery).count()

    # tf and nd for every ngram in a single aggregate query
    # (or perhaps at least do the occurrences right now at the same time)
    tf_nd = (session
             .query(
                NodeNgram.ngram_id,
                func.sum(NodeNgram.weight),    # tf: same as occnode
                func.count(NodeNgram.node_id)  # nd: n docs with term
              )
             .filter(NodeNgram.node_id.in_(docids_subquery))
             .group_by(NodeNgram.ngram_id)
             .all()
            )

    # BUGFIX: the first version computed tf / log(total_docs/nd), which is
    # not tf-idf (tf * idf) and raised ZeroDivisionError whenever a term
    # appears in every document (nd == total_docs => log(1) == 0).
    # With the multiplication, such ubiquitous terms correctly score 0.
    tfidfs = {}
    for (ngram_id, tf, nd) in tf_nd:
        tfidfs[ngram_id] = tf * log(total_docs / nd)

    # create the new TFIDF-CORPUS node
    ltfidf = Node()
    ltfidf.typename = "TFIDF-CORPUS"
    ltfidf.name = "tfidf (in:%s)" % corpus.id
    ltfidf.parent_id = corpus.id
    ltfidf.user_id = corpus.user_id
    session.add(ltfidf)
    session.commit()

    # reflect the scores in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((ltfidf.id, corpus.id, ng, score) for ng, score in tfidfs.items())
    )
    return ltfidf.id
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment