Commit 75a7e329 authored by delanoe

Merge branch 'romain-refactoring' into unstable

parents 6c438c85 eee27166
@@ -6,15 +6,19 @@ digraph ngramflow {
labelloc="t" ;
"extracted_ngrams" -> "grouplist" ;
"extracted_ngrams" -> "occs+tfidfs" ;
"main_user_stoplist" -> "stoplist" ;
"extracted_ngrams" -> "occs+ti_rank" ;
"project stoplist (todo)" -> "stoplist" ;
"stoplist" -> "mainlist" ;
"occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
"occs+ti_rank" -> "mainlist" [label=" TI_RANK_LIMIT"];
"mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
"coocs" -> "specificity" ;
"specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
"mainlist" -> "tfidf" ;
"tfidf" -> "explore" [label="doc relations with all map and candidates"];
"maplist" -> "explore" ;
"grouplist" -> "maplist" ;
"grouplist" -> "occs+ti_rank" ;
"grouplist" -> "coocs" ;
"grouplist" -> "tfidf" ;
}
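The updated graph routes extracted_ngrams through occs+ti_rank (instead of plain occs+tfidfs) for mainlist selection, and wires grouplist into ti_rank, coocs and tfidf. A minimal sketch for regenerating the PNG below, assuming the graph source sits next to it as ngram_parsing_flow.dot (a hypothetical path) and the Python graphviz package is installed:

from graphviz import Source

# render the digraph above to the PNG tracked in doc/schemas/
src = Source.from_file('doc/schemas/ngram_parsing_flow.dot')
src.render('doc/schemas/ngram_parsing_flow', format='png', cleanup=True)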
doc/schemas/ngram_parsing_flow.png: binary image replaced (52.5 KB → 75.9 KB)
@@ -196,10 +196,10 @@ class WeightedMatrix(_BaseClass):
self.id = source
from gargantext.models import NodeNgramNgram
query = (session
.query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.score)
.query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.weight)
.filter(NodeNgramNgram.node_id == source)
)
for key1, key2, value in self.items.items():
for key1, key2, value in query.all():
self.items[key1, key2] = value
elif isinstance(source, WeightedMatrix):
for key1, key2, value in source:
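Two fixes in the loader above: the query now reads NodeNgramNgram.weight (the actual column) rather than score, and the loop iterates query.all() rather than self.items.items(), which is still empty at load time. A toy sketch of the corrected pattern, with literals standing in for the query result:

# rows as returned by query.all(): (ngram1_id, ngram2_id, weight)
rows = [(9731, 9732, 2.0), (9731, 9733, 1.0)]

items = {}
for key1, key2, value in rows:    # iterate the query result,
    items[key1, key2] = value     # not the still-empty self.items

assert items[9731, 9732] == 2.0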
@@ -225,11 +225,14 @@ class WeightedMatrix(_BaseClass):
session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_id).delete()
session.commit()
# insert new data
print("WeightedMatrix bulk_insert start")
bulk_insert(
NodeNgramNgram,
('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
((node_id, key1, key2, value) for key1, key2, value in self)
)
print("WeightedMatrix bulk_insert stop")
def __radd__(self, other):
result = NotImplemented
......
@@ -6,12 +6,12 @@ from .hyperdata_indexing import index_hyperdata
# in usual run order
from .list_stop import do_stoplist
from .ngram_groups import compute_groups
from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
from .list_main import do_mainlist
from .ngram_coocs import compute_coocs
from .metric_specificity import compute_specificity
from .list_map import do_maplist # TEST
from .ngram_groups import compute_groups
from .mail_notification import notify_owner
from gargantext.util.db import session
from gargantext.models import Node
@@ -129,27 +129,31 @@ def parse_extract_indexhyperdata(corpus):
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id = compute_tfidf_local(corpus)
ltfidf_id = compute_tfidf_local(corpus,
on_list_id=mainlist_id,
groupings_id = group_id)
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
# => used for doc <=> ngram association
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
coocs = compute_coocs(corpus,
on_list_id = mainlist_id,
groupings_id = group_id,
just_pass_result = True)
print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id
# ,groupings_id = group_id
)
spec_id = compute_specificity(corpus,cooc_matrix = coocs)
# no need here for subforms because cooc already counted them in mainform
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
# maplist: compute + write (to Node and NodeNgram)
map_id = do_maplist(corpus,
mainlist_id = mainlist_id,
specificity_id=spec_id,
grouplist_id=group_id)
grouplist_id=group_id
)
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
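The handoff between the two steps changes shape here: compute_coocs keeps its result in memory (just_pass_result=True) and compute_specificity consumes it through the new cooc_matrix parameter, saving one node write plus one re-read. A sketch of the two call chains, using only names from this diff:

# before: persist a COOCCURRENCES node, then re-read it by id
# cooc_id = compute_coocs(corpus, mainlist_id=mainlist_id, groupings_id=group_id)
# spec_id = compute_specificity(corpus, cooc_id=cooc_id)

# after: pass the WeightedMatrix directly, no intermediate node
coocs = compute_coocs(corpus, on_list_id=mainlist_id,
                      groupings_id=group_id, just_pass_result=True)
spec_id = compute_specificity(corpus, cooc_matrix=coocs)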
@@ -160,7 +164,7 @@ def parse_extract_indexhyperdata(corpus):
if DEBUG is False:
print('CORPUS #%d: [%s] FINISHED Sendind email notification' % (corpus.id, t()))
print('CORPUS #%d: [%s] FINISHED Sending email notification' % (corpus.id, t()))
notify_owner(corpus)
corpus.status('Workflow', progress=10, complete=True)
......
@@ -43,15 +43,11 @@ def do_maplist(corpus,
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
mainterms_subquery = (session
# we want only terms within mainlist
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == mainlist_id)
.subquery()
)
MainlistTable = aliased(NodeNgram)
primary_groupterms_subquery = (session
# we want only primary terms (ngram1)
IsSubform = (session
# we want only secondary terms (ngram2)
# to be able to filter them out
.query(NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == grouplist_id)
.subquery()
@@ -63,8 +59,15 @@ def do_maplist(corpus,
query = (session.query(ScoreSpec.ngram_id)
.join(Ngram, Ngram.id == ScoreSpec.ngram_id)
.filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
.filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
# we want only terms within mainlist
.join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
.filter(MainlistTable.node_id == mainlist_id)
# we remove all ngrams matching an ngram2_id from the synonyms
.outerjoin(IsSubform,
IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
.filter(IsSubform.c.ngram2_id == None)
)
# TODO: move these 2 pools up to mainlist selection
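The rewrite above replaces the two IN/NOT IN subqueries with joins: an inner join against the mainlist and an anti-join (LEFT OUTER JOIN then IS NULL) against the grouping's ngram2 side to drop subforms. A self-contained sketch of the anti-join idiom, with toy tables standing in for gargantext's NodeNgram/NodeNgramNgram:

from sqlalchemy import Column, Integer, Float, create_engine
from sqlalchemy.orm import declarative_base, Session

Base = declarative_base()

class Score(Base):                      # stands in for ScoreSpec rows
    __tablename__ = 'scores'
    ngram_id = Column(Integer, primary_key=True)
    value = Column(Float)

class Subform(Base):                    # stands in for grouped ngram2 rows
    __tablename__ = 'subforms'
    ngram2_id = Column(Integer, primary_key=True)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([Score(ngram_id=1, value=0.5),
                     Score(ngram_id=2, value=0.7),
                     Subform(ngram2_id=2)])
    session.commit()
    # anti-join: LEFT OUTER JOIN the subforms, keep rows with no match
    kept = (session.query(Score.ngram_id)
                   .outerjoin(Subform, Subform.ngram2_id == Score.ngram_id)
                   .filter(Subform.ngram2_id == None)
                   .all())
    print(kept)   # [(1,)] -> ngram 2 was filtered out as a subform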
@@ -94,7 +97,7 @@ def do_maplist(corpus,
new_hyperdata = { 'corpus': corpus.id,
'limit' : limit,
'monograms_part' : monograms_part,
'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else obtained_mono
'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
}
if overwrite_id:
# overwrite pre-existing node
......
@@ -9,7 +9,7 @@ from collections import defaultdict
from pandas import DataFrame
import pandas as pd
def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
'''
Compute the specificity, simple calculus.
@@ -18,17 +18,25 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
- overwrite_id: optional preexisting specificity node to overwrite
'''
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: new choice cooc already filtered on tfidf before creation
matrix = defaultdict(lambda : defaultdict(float))
# £TODO re-rename weight => score
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
if cooc_id == None and cooc_matrix == None:
raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
elif cooc_id:
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
)
# no filtering: cooc already filtered on mainlist_id at creation
for cooccurrence in cooccurrences:
matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
elif cooc_matrix:
# copy WeightedMatrix into local matrix structure
for (ngram1_id, ngram2_id) in cooc_matrix.items:
w = cooc_matrix.items[(ngram1_id, ngram2_id)]
matrix[ngram1_id][ngram2_id] = w
nb_ngrams = len(matrix)
......
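compute_specificity now takes either a persisted cooc node (cooc_id) or the in-memory WeightedMatrix (cooc_matrix). Note the cooc_id branch mirrors each pair into both orientations while the cooc_matrix branch stores each pair as-is, presumably because compute_coocs' new symmetry_filter=False default already produces both orientations (an inference from this diff, not stated in it). A toy sketch of the copy loop:

from collections import defaultdict

# toy stand-in for cooc_matrix.items: a dict keyed by (ngram1_id, ngram2_id),
# assumed to already hold both orientations of each pair
items = {(1, 2): 3.0, (2, 1): 3.0, (1, 3): 1.0, (3, 1): 1.0}

matrix = defaultdict(lambda: defaultdict(float))
for (ngram1_id, ngram2_id), w in items.items():
    matrix[ngram1_id][ngram2_id] = w

print(dict(matrix[1]))   # {2: 3.0, 3: 1.0}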
@@ -377,12 +377,18 @@ def compute_ti_ranking(corpus,
def compute_tfidf_local(corpus, overwrite_id=None):
def compute_tfidf_local(corpus,
on_list_id=None,
groupings_id=None,
overwrite_id=None):
"""
Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus
Parameters:
- the corpus itself
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
- on_list_id: mainlist or maplist type, to constrain the input ngrams
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
@@ -398,36 +404,94 @@ def compute_tfidf_local(corpus, overwrite_id=None):
# N
total_docs = session.query(docids_subquery).count()
# number of docs with given term (number of rows = M ngrams)
n_docswith_ng = (session
.query(
NodeNgram.ngram_id,
func.count(NodeNgram.node_id).label("nd") # nd: n docs with term
)
.filter(NodeNgram.node_id.in_(docids_subquery))
.group_by(NodeNgram.ngram_id)
.all()
)
# { ngram_id => log(nd) }
log_nd_lookup = {row.ngram_id : log(row.nd) for row in n_docswith_ng}
# define the counted form
if not groupings_id:
ngform_id = NodeNgram.ngram_id
else:
Syno = (session.query(NodeNgramNgram.ngram1_id,
NodeNgramNgram.ngram2_id)
.filter(NodeNgramNgram.node_id == groupings_id)
.subquery()
)
ngform_id = case([
(Syno.c.ngram1_id != None, Syno.c.ngram1_id),
(Syno.c.ngram1_id == None, NodeNgram.ngram_id)
])
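# (editor's note, hypothetical ids: given a grouping row ngram1_id=11
# mainform / ngram2_id=22 subform, the CASE above yields 11 for
# occurrences of 22 and leaves an ungrouped ngram 33 unchanged, so
# subform frequencies land under their mainform at group_by time)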
# tf for each couple (number of rows = N docs X M ngrams)
tf_doc_ng = (session
tf_doc_query = (session
.query(
NodeNgram.ngram_id,
ngform_id,
NodeNgram.node_id,
func.sum(NodeNgram.weight).label("tf"), # tf: occurrences
)
.filter(NodeNgram.node_id.in_(docids_subquery))
.group_by(NodeNgram.node_id, NodeNgram.ngram_id)
.all()
# select within docs of current corpus
.join(docids_subquery,
docids_subquery.c.id == NodeNgram.node_id)
)
if groupings_id:
tf_doc_query = ( tf_doc_query
.outerjoin(Syno, Syno.c.ngram2_id == NodeNgram.ngram_id)
)
# now when we'll group_by the ngram2 freqs will be added to ngram1
if on_list_id:
Miamlist = aliased(NodeNgram)
tf_doc_query = ( tf_doc_query
.join(Miamlist, Miamlist.ngram_id == ngform_id)
.filter( Miamlist.node_id == on_list_id )
)
# execute query to do our tf sum
tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all()
# ex: [(128371, 9732, 1.0),
# (128383, 9740, 1.0),
# (128373, 9731, 1.0),
# (128376, 9734, 1.0),
# (128372, 9731, 1.0),
# (128383, 9733, 1.0),
# (128383, 9735, 1.0),
# (128389, 9734, 1.0),
# (8624, 9731, 1.0),
# (128382, 9740, 1.0),
# (128383, 9739, 1.0),
# (128383, 9736, 1.0),
# (128378, 9735, 1.0),
# (128375, 9733, 4.0),
# (128383, 9732, 1.0)]
# ^ ^ ^^ ^^
# ngram doc freq in this doc
# simultaneously count docs with given term (number of rows = M ngrams)
ndocswithngram = {}
for triple in tf_per_doc:
ng = triple[0]
doc = triple[1]
if ng in ndocswithngram:
ndocswithngram[ng] += 1
else:
ndocswithngram[ng] = 1
# print(ndocswithngram)
# store for use in formula
# { ngram_id => log(nd) }
log_nd_lookup = {ng : log(nd_count)
for (ng, nd_count) in ndocswithngram.items()}
# ---------------------------------------------------------
tfidfs = {}
log_tot_docs = log(total_docs)
for (ngram_id, node_id, tf) in tf_doc_ng:
for (ngram_id, node_id, tf) in tf_per_doc:
log_nd = log_nd_lookup[ngram_id]
# tfidfs[ngram_id] = tf * log(total_docs/nd)
tfidfs[node_id, ngram_id] = tf * (log_tot_docs-log_nd)
......
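The final loop implements tfidf(doc, ngram) = tf * (log(total_docs) - log(nd)), which equals tf * log(total_docs/nd). A worked numeric check:

from math import log, isclose

total_docs = 100   # N: docs in the corpus
tf = 4.0           # occurrences of the ngram in this doc
nd = 10            # docs containing the ngram

tfidf = tf * (log(total_docs) - log(nd))
assert isclose(tfidf, tf * log(total_docs / nd))
print(round(tfidf, 2))   # 9.21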
@@ -10,13 +10,15 @@ from sqlalchemy.sql.expression import case # for choice if ngram has mainform or
def compute_coocs( corpus,
overwrite_id = None,
just_pass_result= True, # just return the WeightedMatrix,
# (don't write to DB)
threshold = DEFAULT_COOC_THRESHOLD,
groupings_id = None,
mainlist_id = None,
on_list_id = None,
stoplist_id = None,
start = None,
end = None,
symmetry_filter = True):
symmetry_filter = False):
"""
Count how often some extracted terms appear
together in a small context (document)
@@ -46,7 +48,7 @@ def compute_coocs( corpus,
- threshold: on output cooc count (previously called hapax)
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
- mainlist_id: mainlist to constrain the input ngrams
- on_list_id: mainlist or maplist type, to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date
@@ -62,9 +64,10 @@ def compute_coocs( corpus,
======================
each pair of ngrams sharing same doc (node_id)
SELECT idxa.ngram_id, idxb.ngram_id
FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
FROM nodes_ngrams AS idxa
---------------------------------
WHERE idxa.node_id = idxb.node_id <== that's cooc
JOIN nodes_ngrams AS idxb
ON idxa.node_id = idxb.node_id <== that's cooc
---------------------------------
AND idxa.ngram_id <> idxb.ngram_id
AND idxa.node_id = MY_DOC ;
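A runnable miniature of that self-join, on an in-memory sqlite table standing in for nodes_ngrams:

import sqlite3

db = sqlite3.connect(':memory:')
db.execute('CREATE TABLE nodes_ngrams (node_id INT, ngram_id INT)')
db.executemany('INSERT INTO nodes_ngrams VALUES (?, ?)',
               [(1, 10), (1, 20), (1, 30), (2, 10), (2, 20)])

pairs = db.execute('''
    SELECT idxa.ngram_id, idxb.ngram_id
    FROM nodes_ngrams AS idxa
    JOIN nodes_ngrams AS idxb
      ON idxa.node_id = idxb.node_id      -- that's cooc
    WHERE idxa.ngram_id <> idxb.ngram_id
      AND idxa.node_id = 1                -- MY_DOC
''').fetchall()

print(pairs)   # [(10, 20), (10, 30), (20, 10), (20, 30), (30, 10), (30, 20)]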
@@ -188,7 +191,7 @@ def compute_coocs( corpus,
# 4) INPUT FILTERS (reduce N before O(N²))
if mainlist_id:
if on_list_id:
m1 = aliased(NodeNgram)
m2 = aliased(NodeNgram)
@@ -197,8 +200,8 @@ def compute_coocs( corpus,
.join(m1, m1.ngram_id == Xindex_ngform_id)
.join(m2, m2.ngram_id == Yindex_ngform_id)
.filter( m1.node_id == mainlist_id )
.filter( m2.node_id == mainlist_id )
.filter( m1.node_id == on_list_id )
.filter( m2.node_id == on_list_id )
)
if stoplist_id:
@@ -279,31 +282,36 @@ def compute_coocs( corpus,
shape_1 = len({pair[1] for pair in matrix.items})
print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
# 5) SAVE
# --------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus' : corpus.id,
'threshold': threshold }
if overwrite_id:
# overwrite pre-existing id
the_cooc = cache.Node[overwrite_id]
the_cooc.hyperdata = new_hyperdata
the_cooc.save_hyperdata()
session.commit()
the_id = overwrite_id
else:
# create the new cooc node
the_cooc = corpus.add_child(
typename = "COOCCURRENCES",
name = "Coocs (in:%s)" % corpus.name[0:10],
hyperdata = new_hyperdata,
)
session.add(the_cooc)
session.commit()
the_id = the_cooc.id
# ==> save all NodeNgramNgram with link to new cooc node id
matrix.save(the_id)
return the_id
if just_pass_result:
return matrix
else:
# 5) SAVE
# --------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus' : corpus.id,
'threshold': threshold }
if overwrite_id:
# overwrite pre-existing id
the_cooc = cache.Node[overwrite_id]
the_cooc.hyperdata = new_hyperdata
the_cooc.save_hyperdata()
session.commit()
the_id = overwrite_id
else:
# create the new cooc node
the_cooc = corpus.add_child(
typename = "COOCCURRENCES",
name = "Coocs (in:%s)" % corpus.name[0:10],
hyperdata = new_hyperdata,
)
session.add(the_cooc)
session.commit()
the_id = the_cooc.id
# ==> save all NodeNgramNgram with link to new cooc node id
matrix.save(the_id)
return the_id
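With the branch above, the caller picks the return contract: an in-memory WeightedMatrix or a persisted node id. A usage sketch with parameter names from this diff:

# in-memory: nothing written, feed the matrix straight to specificity
matrix = compute_coocs(corpus, on_list_id=mainlist_id,
                       groupings_id=group_id, just_pass_result=True)

# persisted: COOCCURRENCES node + NodeNgramNgram rows, returns its id
cooc_id = compute_coocs(corpus, on_list_id=mainlist_id,
                        groupings_id=group_id, just_pass_result=False)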