Commit fee236bc authored by Romain Loth's avatar Romain Loth

first simple version of tfidf in ngram_scores

parent 32495844
...@@ -9,29 +9,30 @@ LISTTYPES = { ...@@ -9,29 +9,30 @@ LISTTYPES = {
'STOPLIST' : UnweightedList, 'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList, 'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList, 'MAPLIST' : UnweightedList,
'OCCURRENCES' : WeightedList, 'OCCURRENCES' : WeightedContextIndex,
'COOCCURRENCES': WeightedMatrix, 'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedContextIndex,
} }
NODETYPES = [ NODETYPES = [
None, None,
# documents hierarchy # documents hierarchy
'USER', 'USER', # 1
'PROJECT', 'PROJECT', # 2
'CORPUS', 'CORPUS', # 3
'DOCUMENT', 'DOCUMENT', # 4
# lists # lists
'STOPLIST', 'STOPLIST', # 5
'GROUPLIST', 'GROUPLIST', # 6
'MAINLIST', 'MAINLIST', # 7
'MAPLIST', 'MAPLIST', # 8
'COOCCURRENCES', 'COOCCURRENCES', # 9
# scores # scores
'OCCURRENCES', 'OCCURRENCES', # 10
'SPECIFICITY', 'SPECIFICITY', # 11
'CVALUE', 'CVALUE', # 12
'TFIDF-CORPUS', 'TFIDF-CORPUS', # 13
'TFIDF-GLOBAL', 'TFIDF-GLOBAL', # 14
] ]
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
""" """
__all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList'] __all__ = ['Translations', 'WeightedMatrix', 'UnweightedList', 'WeightedList', 'WeightedContextIndex']
from gargantext.util.db import session, bulk_insert from gargantext.util.db import session, bulk_insert
...@@ -165,6 +165,22 @@ class Translations(_BaseClass): ...@@ -165,6 +165,22 @@ class Translations(_BaseClass):
) )
class WeightedContextIndex(_BaseClass):
    """
    Contextual index / registry tensor: a matrix of weighted ngrams
    *per* doc *per* context.

    associated model   : NodeNodeNgram
    associated columns : node1_id | node2_id | ngram_id | score (float)

    Example: tfidf by corpus
    """

    def __init__(self, source=None):
        # missing keys read as weight 0.0
        self.items = defaultdict(float)
class WeightedMatrix(_BaseClass): class WeightedMatrix(_BaseClass):
def __init__(self, source=None): def __init__(self, source=None):
......
from .parsing import parse from .parsing import parse
from .ngrams_extraction import extract_ngrams from .ngrams_extraction import extract_ngrams
from .ngram_scores import compute_occurrences_local, compute_tfidf_local
from .ngram_groups import compute_groups
from gargantext.util.db import session from gargantext.util.db import session
from gargantext.models import Node from gargantext.models import Node
from .group import compute_groups
def parse_extract(corpus): def parse_extract(corpus):
# retrieve corpus from database from id # retrieve corpus from database from id
...@@ -23,5 +22,15 @@ def parse_extract(corpus): ...@@ -23,5 +22,15 @@ def parse_extract(corpus):
print('CORPUS #%d: extracted ngrams' % (corpus.id)) print('CORPUS #%d: extracted ngrams' % (corpus.id))
# temporary ngram lists workflow # temporary ngram lists workflow
group_id = compute_groups(corpus)
print('CORPUS #%d: new grouplist = #%i' % (corpus.id, group_id)) # write occurrences to Node and NodeNodeNgram
occnd_id = compute_occurrences_local(corpus)
print('CORPUS #%d: new occs node #%i' % (corpus.id, occnd_id))
# write local tfidf to Node and NodeNodeNgram
ltfidf_id = compute_tfidf_local(corpus)
print('CORPUS #%d: new localtfidf node #%i' % (corpus.id, ltfidf_id))
# write groups to Node and NodeNgramNgram
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: new grouplist node #%i' % (corpus.id, group_id))
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert
# £TODO
# from gargantext.util.lists import WeightedContextIndex
from gargantext.util.db import func # = sqlalchemy.func like sum() or count()
from math import log
def compute_occurrences_local(corpus):
    """
    Sum the occurrence weights of each ngram over all documents of *corpus*,
    store the result as a new "OCCURRENCES" node (one NodeNodeNgram row per
    ngram) and return the new node's id.
    """
    # 1) scope of the counts: ids of every DOCUMENT under this corpus
    # (slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()])
    doc_ids = (session
                .query(Node.id)
                .filter(Node.parent_id == corpus.id)
                .filter(Node.typename == "DOCUMENT")
                .subquery()
              )

    # 2) one row per ngram: (ngram_id, summed weight over the corpus docs)
    counts = (session
                .query(NodeNgram.ngram_id, func.sum(NodeNgram.weight))
                .filter(NodeNgram.node_id.in_(doc_ids))
                .group_by(NodeNgram.ngram_id)
                .all()
             )
    # e.g. [(1970, 1.0), (2024, 2.0), (259, 2.0), ...]
    #        ngram_id^   sum_weight^

    # the new OCCURRENCES node that will own these scores
    result_node = Node()
    result_node.typename  = "OCCURRENCES"
    result_node.name      = "occ_sums (in:%s)" % corpus.id
    result_node.parent_id = corpus.id
    result_node.user_id   = corpus.user_id
    session.add(result_node)
    session.commit()

    # stored in NodeNodeNgram (could be NodeNgram but kept in harmony w/ tfidf)
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((result_node.id, corpus.id, ngram_id, weight)
          for (ngram_id, weight) in counts)
    )

    return result_node.id
def compute_tfidf_local(corpus):
"""
Calculates tfidf within the current corpus
"""
# ?? FIXME could we keep the docids somehow from previous computations ??
docids_subquery = (session
.query(Node.id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == "DOCUMENT")
.subquery()
)
total_docs = session.query(docids_subquery).count()
# or perhaps at least do the occurrences right now at the same time
tf_nd = (session
.query(
NodeNgram.ngram_id,
func.sum(NodeNgram.weight), # tf: same as occnode
func.count(NodeNgram.node_id) # nd: n docs with term
)
.filter(NodeNgram.node_id.in_(docids_subquery))
.group_by(NodeNgram.ngram_id)
.all()
)
# ---------------------------------------------
tfidfs = {}
for (ngram_id, tf, nd) in tf_nd:
tfidfs[ngram_id] = tf / log(total_docs/nd)
# ---------------------------------------------
# create the new TFIDF-CORPUS node
ltfidf = Node()
ltfidf.typename = "TFIDF-CORPUS"
ltfidf.name = "tfidf (in:%s)" % corpus.id
ltfidf.parent_id = corpus.id
ltfidf.user_id = corpus.user_id
session.add(ltfidf)
session.commit()
# reflect that in NodeNodeNgrams
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert(
NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'),
((ltfidf.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
)
return ltfidf.id
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment