Commit 75a7e329 authored by delanoe

Merge branch 'romain-refactoring' into unstable

parents 6c438c85 eee27166
@@ -6,15 +6,19 @@ digraph ngramflow {
     labelloc="t" ;
     "extracted_ngrams" -> "grouplist" ;
-    "extracted_ngrams" -> "occs+tfidfs" ;
-    "main_user_stoplist" -> "stoplist" ;
+    "extracted_ngrams" -> "occs+ti_rank" ;
+    "project stoplist (todo)" -> "stoplist" ;
     "stoplist" -> "mainlist" ;
-    "occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
+    "occs+ti_rank" -> "mainlist" [label=" TI_RANK_LIMIT"];
     "mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
     "coocs" -> "specificity" ;
     "specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
+    "mainlist" -> "tfidf" ;
+    "tfidf" -> "explore" [label="doc relations with all map and candidates"];
     "maplist" -> "explore" ;
-    "grouplist" -> "maplist" ;
+    "grouplist" -> "occs+ti_rank" ;
+    "grouplist" -> "coocs" ;
+    "grouplist" -> "tfidf" ;
 }
doc/schemas/ngram_parsing_flow.png (binary image changed: 52.5 KB → 75.9 KB)
@@ -196,10 +196,10 @@ class WeightedMatrix(_BaseClass):
             self.id = source
             from gargantext.models import NodeNgramNgram
             query = (session
-                .query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.score)
+                .query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.weight)
                 .filter(NodeNgramNgram.node_id == source)
             )
-            for key1, key2, value in self.items.items():
+            for key1, key2, value in query.all():
                 self.items[key1, key2] = value
         elif isinstance(source, WeightedMatrix):
             for key1, key2, value in source:
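The second change above fixes a real bug, not just naming: the old loop iterated over self.items, which is empty at construction time, so nothing was ever read back from the database. A minimal sketch of the corrected load path, using the session and model imports this codebase already provides (the node id is a placeholder):

    from collections import defaultdict
    from gargantext.util.db import session
    from gargantext.models import NodeNgramNgram

    source_node_id = 42                 # placeholder: id of the cooc Node to load
    items = defaultdict(float)          # stand-in for WeightedMatrix.items
    rows = (session
            .query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.weight)
            .filter(NodeNgramNgram.node_id == source_node_id))
    for key1, key2, value in rows.all():   # iterate the DB rows, not the empty dict
        items[key1, key2] = value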
@@ -225,11 +225,14 @@ class WeightedMatrix(_BaseClass):
         session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == node_id).delete()
         session.commit()

         # insert new data
+        print("WeightedMatrix bulk_insert start")
         bulk_insert(
             NodeNgramNgram,
             ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
             ((node_id, key1, key2, value) for key1, key2, value in self)
         )
+        print("WeightedMatrix bulk_insert stop")

     def __radd__(self, other):
         result = NotImplemented
@@ -6,12 +6,12 @@ from .hyperdata_indexing import index_hyperdata
 # in usual run order
 from .list_stop import do_stoplist
-from .ngram_groups import compute_groups
 from .metric_tfidf import compute_occs, compute_tfidf_local, compute_ti_ranking
 from .list_main import do_mainlist
 from .ngram_coocs import compute_coocs
 from .metric_specificity import compute_specificity
 from .list_map import do_maplist # TEST
+from .ngram_groups import compute_groups
 from .mail_notification import notify_owner
 from gargantext.util.db import session
 from gargantext.models import Node
@@ -129,27 +129,31 @@ def parse_extract_indexhyperdata(corpus):
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

     # -> write local tfidf similarities to Node and NodeNodeNgram
-    # TODO only on mainlist
-    ltfidf_id = compute_tfidf_local(corpus)
+    ltfidf_id = compute_tfidf_local(corpus,
+                                    on_list_id=mainlist_id,
+                                    groupings_id = group_id)
     print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
     # => used for doc <=> ngram association

     # ------------
-    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
-    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, groupings_id = group_id)
-    print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
+    # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)
+    coocs = compute_coocs(corpus,
+                          on_list_id = mainlist_id,
+                          groupings_id = group_id,
+                          just_pass_result = True)
+    print('CORPUS #%d: [%s] computed mainlist coocs for specif rank' % (corpus.id, t()))

     # -> specificity: compute + write (=> NodeNodeNgram)
-    spec_id = compute_specificity(corpus, cooc_id=cooc_id
-                                  # ,groupings_id = group_id
-                                  )
+    spec_id = compute_specificity(corpus, cooc_matrix = coocs)
+    # no need here for subforms because cooc already counted them in mainform
     print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))

-    # ?? maplist: compute + write (to Node and NodeNgram)
+    # maplist: compute + write (to Node and NodeNgram)
     map_id = do_maplist(corpus,
                         mainlist_id = mainlist_id,
                         specificity_id=spec_id,
-                        grouplist_id=group_id)
+                        grouplist_id=group_id
+                        )
     print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))

     print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))
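Passing just_pass_result=True means the mainlist coocs are handed to compute_specificity as an in-memory WeightedMatrix instead of being persisted to a cooc Node first, saving one Node plus its NodeNgramNgram rows on every workflow run. Both entry points remain usable; a hedged sketch (assuming the save branch of compute_coocs still returns a node id, as the old call here implies):

    # new in-memory path (what the workflow now does)
    coocs = compute_coocs(corpus, on_list_id=mainlist_id,
                          groupings_id=group_id, just_pass_result=True)
    spec_id = compute_specificity(corpus, cooc_matrix=coocs)

    # old DB path, still available
    cooc_id = compute_coocs(corpus, on_list_id=mainlist_id,
                            groupings_id=group_id, just_pass_result=False)
    spec_id = compute_specificity(corpus, cooc_id=cooc_id)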
@@ -160,7 +164,7 @@ def parse_extract_indexhyperdata(corpus):
     if DEBUG is False:
-        print('CORPUS #%d: [%s] FINISHED Sendind email notification' % (corpus.id, t()))
+        print('CORPUS #%d: [%s] FINISHED Sending email notification' % (corpus.id, t()))
         notify_owner(corpus)

     corpus.status('Workflow', progress=10, complete=True)
@@ -43,15 +43,11 @@ def do_maplist(corpus,
     #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

-    mainterms_subquery = (session
-                          # we want only terms within mainlist
-                          .query(NodeNgram.ngram_id)
-                          .filter(NodeNgram.node_id == mainlist_id)
-                          .subquery()
-                          )
+    MainlistTable = aliased(NodeNgram)

-    primary_groupterms_subquery = (session
-                                   # we want only primary terms (ngram1)
+    IsSubform = (session
+                 # we want only secondary terms (ngram2)
+                 # to be able to filter them out
                  .query(NodeNgramNgram.ngram2_id)
                  .filter(NodeNgramNgram.node_id == grouplist_id)
                  .subquery()
                  )
@@ -63,8 +59,15 @@ def do_maplist(corpus,
     query = (session.query(ScoreSpec.ngram_id)
              .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
              .filter(ScoreSpec.node_id == specificity_id)
-             .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
-             .filter(ScoreSpec.ngram_id.notin_(primary_groupterms_subquery))
+
+             # we want only terms within mainlist
+             .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
+             .filter(MainlistTable.node_id == mainlist_id)
+
+             # we remove all ngrams matching an ngram2_id from the synonyms
+             .outerjoin(IsSubform,
+                        IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
+             .filter(IsSubform.c.ngram2_id == None)
     )

     # TODO: move these 2 pools up to mainlist selection
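The rewrite swaps the two IN/NOT IN subqueries for joins: an inner join enforces mainlist membership, and the LEFT OUTER JOIN plus IS NULL pair is the standard anti-join idiom for excluding grouped subforms, which the database usually plans better than NOT IN over a large list. Roughly, in the docstring SQL style this repo uses elsewhere (table names illustrative):

    SELECT spec.ngram_id
    FROM nodes_ngrams AS spec                    -- specificity scores
    JOIN nodes_ngrams AS mainlist
      ON mainlist.ngram_id = spec.ngram_id
     AND mainlist.node_id  = MAINLIST_ID         -- keep mainlist terms only
    LEFT JOIN nodes_ngrams_ngrams AS issubform
      ON issubform.ngram2_id = spec.ngram_id
     AND issubform.node_id   = GROUPLIST_ID      -- synonym (subform) rows
    WHERE spec.node_id = SPECIFICITY_ID
      AND issubform.ngram2_id IS NULL ;          -- anti-join: drop subforms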
@@ -94,7 +97,7 @@ def do_maplist(corpus,
     new_hyperdata = { 'corpus': corpus.id,
                       'limit' : limit,
                       'monograms_part' : monograms_part,
-                      'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else obtained_mono
+                      'monograms_result' : obtained_mono/obtained_total if obtained_total != 0 else 0
                     }
     if overwrite_id:
         # overwrite pre-existing node
@@ -9,7 +9,7 @@ from collections import defaultdict
 from pandas import DataFrame
 import pandas as pd

-def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
+def compute_specificity(corpus, cooc_id=None, cooc_matrix=None, overwrite_id = None):
     '''
     Compute the specificity, simple calculus.
@@ -18,18 +18,26 @@ def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
        - overwrite_id: optional preexisting specificity node to overwrite
     '''
+    matrix = defaultdict(lambda : defaultdict(float))
+
+    if cooc_id == None and cooc_matrix == None:
+        raise TypeError("compute_specificity: needs a cooc_id or cooc_matrix param")
+
+    elif cooc_id:
         cooccurrences = (session.query(NodeNgramNgram)
                          .filter(NodeNgramNgram.node_id==cooc_id)
                          )
-    # no filtering: new choice cooc already filtered on tfidf before creation
-    matrix = defaultdict(lambda : defaultdict(float))
+        # no filtering: cooc already filtered on mainlist_id at creation

-    # £TODO re-rename weight => score
         for cooccurrence in cooccurrences:
             matrix[cooccurrence.ngram1_id][cooccurrence.ngram2_id] = cooccurrence.weight
             matrix[cooccurrence.ngram2_id][cooccurrence.ngram1_id] = cooccurrence.weight
+
+    elif cooc_matrix:
+        # copy WeightedMatrix into local matrix structure
+        for (ngram1_id, ngram2_id) in cooc_matrix.items:
+            w = cooc_matrix.items[(ngram1_id, ngram2_id)]
+            matrix[ngram1_id][ngram2_id] = w

     nb_ngrams = len(matrix)
     print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
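Note the asymmetry between the two branches: the cooc_id branch mirrors every pair into both matrix[a][b] and matrix[b][a], while the cooc_matrix branch copies pairs one-way. The two only produce the same matrix if the incoming WeightedMatrix already contains both orientations, which is presumably why symmetry_filter now defaults to False in compute_coocs below. A compact sketch of the copy step, given a WeightedMatrix cooc_matrix whose items maps (ngram1_id, ngram2_id) -> weight (per lists.py above):

    from collections import defaultdict

    matrix = defaultdict(lambda: defaultdict(float))
    for (ng1, ng2), w in cooc_matrix.items.items():   # same pairs as the diff's loop
        matrix[ng1][ng2] = w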
@@ -377,12 +377,18 @@ def compute_ti_ranking(corpus,
-def compute_tfidf_local(corpus, overwrite_id=None):
+def compute_tfidf_local(corpus,
+                        on_list_id=None,
+                        groupings_id=None,
+                        overwrite_id=None):
     """
     Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus

     Parameters:
       - the corpus itself
+      - groupings_id: optional synonym relations to add all subform counts
+                      with their mainform's counts
+      - on_list_id: mainlist or maplist type, to constrain the input ngrams
       - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                       (the Node and its previous NodeNodeNgram rows will be replaced)
     """
@@ -398,36 +404,94 @@ def compute_tfidf_local(corpus, overwrite_id=None):
     # N
     total_docs = session.query(docids_subquery).count()

-    # number of docs with given term (number of rows = M ngrams)
-    n_docswith_ng = (session
-                     .query(
-                        NodeNgram.ngram_id,
-                        func.count(NodeNgram.node_id).label("nd")  # nd: n docs with term
-                      )
-                     .filter(NodeNgram.node_id.in_(docids_subquery))
-                     .group_by(NodeNgram.ngram_id)
-                     .all()
-                     )

-    # { ngram_id => log(nd) }
-    log_nd_lookup = {row.ngram_id : log(row.nd) for row in n_docswith_ng}
+    # define the counted form
+    if not groupings_id:
+        ngform_id = NodeNgram.ngram_id
+    else:
+        Syno = (session.query(NodeNgramNgram.ngram1_id,
+                              NodeNgramNgram.ngram2_id)
+                .filter(NodeNgramNgram.node_id == groupings_id)
+                .subquery()
+                )
+        ngform_id = case([
+            (Syno.c.ngram1_id != None, Syno.c.ngram1_id),
+            (Syno.c.ngram1_id == None, NodeNgram.ngram_id)
+        ])

     # tf for each couple (number of rows = N docs X M ngrams)
-    tf_doc_ng = (session
+    tf_doc_query = (session
                  .query(
-                    NodeNgram.ngram_id,
+                    ngform_id,
                     NodeNgram.node_id,
                     func.sum(NodeNgram.weight).label("tf"),  # tf: occurrences
                  )
-                 .filter(NodeNgram.node_id.in_(docids_subquery))
-                 .group_by(NodeNgram.node_id, NodeNgram.ngram_id)
-                 .all()
+
+                 # select within docs of current corpus
+                 .join(docids_subquery,
+                       docids_subquery.c.id == NodeNgram.node_id)
                  )

+    if groupings_id:
+        tf_doc_query = ( tf_doc_query
+                        .outerjoin(Syno, Syno.c.ngram2_id == NodeNgram.ngram_id)
+                        )
+        # now when we'll group_by the ngram2 freqs will be added to ngram1

+    if on_list_id:
+        Miamlist = aliased(NodeNgram)
+        tf_doc_query = ( tf_doc_query
+                        .join(Miamlist, Miamlist.ngram_id == ngform_id)
+                        .filter( Miamlist.node_id == on_list_id )
+                        )

+    # execute query to do our tf sum
+    tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all()

+    # ex: [(128371, 9732, 1.0),
+    #      (128383, 9740, 1.0),
+    #      (128373, 9731, 1.0),
+    #      (128376, 9734, 1.0),
+    #      (128372, 9731, 1.0),
+    #      (128383, 9733, 1.0),
+    #      (128383, 9735, 1.0),
+    #      (128389, 9734, 1.0),
+    #      (8624,   9731, 1.0),
+    #      (128382, 9740, 1.0),
+    #      (128383, 9739, 1.0),
+    #      (128383, 9736, 1.0),
+    #      (128378, 9735, 1.0),
+    #      (128375, 9733, 4.0),
+    #      (128383, 9732, 1.0)]
+    #       ^^^^^^   ^^^^  ^^^
+    #       ngram    doc   freq in this doc

+    # simultaneously count docs with given term (number of rows = M ngrams)
+    ndocswithngram = {}
+    for triple in tf_per_doc:
+        ng = triple[0]
+        doc = triple[1]
+        if ng in ndocswithngram:
+            ndocswithngram[ng] += 1
+        else:
+            ndocswithngram[ng] = 1

+    # print(ndocswithngram)

+    # store for use in formula
+    # { ngram_id => log(nd) }
+    log_nd_lookup = {ng : log(nd_count)
+                     for (ng, nd_count) in ndocswithngram.items()}

     # ---------------------------------------------------------
     tfidfs = {}
     log_tot_docs = log(total_docs)
-    for (ngram_id, node_id, tf) in tf_doc_ng:
+    for (ngram_id, node_id, tf) in tf_per_doc:
         log_nd = log_nd_lookup[ngram_id]
         # tfidfs[ngram_id] = tf * log(total_docs/nd)
         tfidfs[node_id, ngram_id] = tf * (log_tot_docs-log_nd)
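The final loop is the classic tf-idf with natural logs: tfidf(doc, ngram) = tf * (log(N) - log(nd)) = tf * log(N/nd). A worked check with illustrative numbers only:

    from math import log

    total_docs = 100   # N: docs in the corpus
    nd         = 4     # docs containing this ngram form
    tf         = 3.0   # summed occurrences within one doc

    tfidf = tf * (log(total_docs) - log(nd))   # == 3 * log(25) ≈ 9.66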
@@ -10,13 +10,15 @@ from sqlalchemy.sql.expression import case # for choice if ngram has mainform or
 def compute_coocs( corpus,
                    overwrite_id     = None,
+                   just_pass_result = True,   # just return the WeightedMatrix,
+                                              #   (don't write to DB)
                    threshold        = DEFAULT_COOC_THRESHOLD,
                    groupings_id     = None,
-                   mainlist_id      = None,
+                   on_list_id       = None,
                    stoplist_id      = None,
                    start            = None,
                    end              = None,
-                   symmetry_filter  = True):
+                   symmetry_filter  = False):
     """
     Count how often some extracted terms appear
     together in a small context (document)
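The symmetry_filter default also flips to False. When True, the flag presumably keeps only one triangle of the symmetric cooc matrix, which halves storage but would leave the one-way copy in compute_specificity above with only half its cells. A sketch of what such a triangle constraint could look like (Xindex_ngform_id and Yindex_ngform_id exist in this file; the query variable name is an assumption):

    if symmetry_filter:
        # keep a canonical orientation only: (a, b) stored, (b, a) implied
        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)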
@@ -46,7 +48,7 @@ def compute_coocs( corpus,
      - threshold: on output cooc count (previously called hapax)
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
-     - mainlist_id: mainlist to constrain the input ngrams
+     - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date
@@ -62,9 +64,10 @@ def compute_coocs( corpus,
     ======================
     each pair of ngrams sharing same doc (node_id)
        SELECT idxa.ngram_id, idxb.ngram_id
-       FROM nodes_ngrams AS idxa, nodes_ngrams AS idxb
+       FROM nodes_ngrams AS idxa
        ---------------------------------
-       WHERE idxa.node_id = idxb.node_id     <== that's cooc
+       JOIN nodes_ngrams AS idxb
+         ON idxa.node_id = idxb.node_id      <== that's cooc
        ---------------------------------
        AND idxa.ngram_id <> idxb.ngram_id
        AND idxa.node_id = MY_DOC ;
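The docstring now spells the co-occurrence condition as an explicit JOIN ... ON instead of an implicit cross join plus WHERE; the two are equivalent, but the JOIN form matches how the SQLAlchemy code reads. A hedged one-to-one translation, with aliased() and NodeNgram as used elsewhere in this codebase:

    from sqlalchemy.orm import aliased

    MY_DOC = 123   # placeholder: one document's node id
    idxa = aliased(NodeNgram)
    idxb = aliased(NodeNgram)
    pairs = (session
             .query(idxa.ngram_id, idxb.ngram_id)
             .select_from(idxa)
             .join(idxb, idxa.node_id == idxb.node_id)   # <== that's cooc
             .filter(idxa.ngram_id != idxb.ngram_id)     # skip self-pairs
             .filter(idxa.node_id == MY_DOC))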
@@ -188,7 +191,7 @@ def compute_coocs( corpus,
     # 4) INPUT FILTERS (reduce N before O(N²))
-    if mainlist_id:
+    if on_list_id:
         m1 = aliased(NodeNgram)
         m2 = aliased(NodeNgram)
@@ -197,8 +200,8 @@ def compute_coocs( corpus,
                .join(m1, m1.ngram_id == Xindex_ngform_id)
                .join(m2, m2.ngram_id == Yindex_ngform_id)
-               .filter( m1.node_id == mainlist_id )
-               .filter( m2.node_id == mainlist_id )
+               .filter( m1.node_id == on_list_id )
+               .filter( m2.node_id == on_list_id )
         )

     if stoplist_id:
@@ -279,11 +282,16 @@ def compute_coocs( corpus,
     shape_1 = len({pair[1] for pair in matrix.items})
     print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

+    if just_pass_result:
+        return matrix
+    else:
     # 5) SAVE
     # --------
     # saving the parameters of the analysis in the Node JSON
     new_hyperdata = { 'corpus' : corpus.id,
                       'threshold': threshold }
     if overwrite_id:
         # overwrite pre-existing id
         the_cooc = cache.Node[overwrite_id]