Commit eee27166 authored by Romain Loth

finished ngram workflow with groups (todo: 'recount' button after changing groups)

parent 95763e12
@@ -6,15 +6,19 @@ digraph ngramflow {
     labelloc="t" ;
     "extracted_ngrams" -> "grouplist" ;
-    "extracted_ngrams" -> "occs+tfidfs" ;
-    "main_user_stoplist" -> "stoplist" ;
+    "extracted_ngrams" -> "occs+ti_rank" ;
+    "project stoplist (todo)" -> "stoplist" ;
     "stoplist" -> "mainlist" ;
-    "occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
+    "occs+ti_rank" -> "mainlist" [label=" TI_RANK_LIMIT"];
     "mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
     "coocs" -> "specificity" ;
     "specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
+    "mainlist" -> "tfidf" ;
+    "tfidf" -> "explore" [label="doc relations with all map and candidates"];
     "maplist" -> "explore" ;
-    "grouplist" -> "maplist" ;
+    "grouplist" -> "occs+ti_rank" ;
+    "grouplist" -> "coocs" ;
+    "grouplist" -> "tfidf" ;
 }
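The updated PNG below is presumably regenerated from this dot source. A minimal sketch of that regeneration step, assuming the graphviz Python package and the Graphviz binaries are installed; the .dot path is a guess, since only the PNG path appears in this commit:

    # Hypothetical helper: re-render the workflow diagram after editing the graph.
    # Assumes `pip install graphviz` plus system Graphviz; the source path is a guess.
    from graphviz import Source

    src = Source.from_file('doc/schemas/ngramflow.dot')   # assumed source path
    src.render(filename='ngram_parsing_flow', directory='doc/schemas',
               format='png', cleanup=True)                # -> doc/schemas/ngram_parsing_flow.png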
doc/schemas/ngram_parsing_flow.png
(binary image changed: 52.5 KB → 75.9 KB)
@@ -129,8 +129,9 @@ def parse_extract_indexhyperdata(corpus):
     print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

     # -> write local tfidf similarities to Node and NodeNodeNgram
-    # TODO only on mainlist
-    ltfidf_id = compute_tfidf_local(corpus)
+    ltfidf_id = compute_tfidf_local(corpus,
+                                    on_list_id=mainlist_id,
+                                    groupings_id = group_id)
     print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
     # => used for doc <=> ngram association
@@ -377,12 +377,18 @@ def compute_ti_ranking(corpus,
-def compute_tfidf_local(corpus, overwrite_id=None):
+def compute_tfidf_local(corpus,
+                        on_list_id=None,
+                        groupings_id=None,
+                        overwrite_id=None):
     """
     Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus

     Parameters:
       - the corpus itself
+      - groupings_id: optional synonym relations to add all subform counts
+                      with their mainform's counts
+      - on_list_id: mainlist or maplist type, to constrain the input ngrams
       - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
                       (the Node and its previous NodeNodeNgram rows will be replaced)
     """
@@ -398,36 +404,94 @@ def compute_tfidf_local(corpus, overwrite_id=None):
     # N
     total_docs = session.query(docids_subquery).count()

-    # number of docs with given term (number of rows = M ngrams)
-    n_docswith_ng = (session
-        .query(
-            NodeNgram.ngram_id,
-            func.count(NodeNgram.node_id).label("nd")   # nd: n docs with term
-         )
-        .filter(NodeNgram.node_id.in_(docids_subquery))
-        .group_by(NodeNgram.ngram_id)
-        .all()
-    )
-
-    # { ngram_id => log(nd) }
-    log_nd_lookup = {row.ngram_id : log(row.nd) for row in n_docswith_ng}
+    # define the counted form
+    if not groupings_id:
+        ngform_id = NodeNgram.ngram_id
+    else:
+        Syno = (session.query(NodeNgramNgram.ngram1_id,
+                              NodeNgramNgram.ngram2_id)
+                .filter(NodeNgramNgram.node_id == groupings_id)
+                .subquery()
+               )
+        ngform_id = case([
+            (Syno.c.ngram1_id != None, Syno.c.ngram1_id),
+            (Syno.c.ngram1_id == None, NodeNgram.ngram_id)
+        ])

     # tf for each couple (number of rows = N docs X M ngrams)
-    tf_doc_ng = (session
+    tf_doc_query = (session
         .query(
-            NodeNgram.ngram_id,
+            ngform_id,
             NodeNgram.node_id,
             func.sum(NodeNgram.weight).label("tf"),  # tf: occurrences
          )
-        .filter(NodeNgram.node_id.in_(docids_subquery))
-        .group_by(NodeNgram.node_id, NodeNgram.ngram_id)
-        .all()
+        # select within docs of current corpus
+        .join(docids_subquery,
+              docids_subquery.c.id == NodeNgram.node_id)
     )

+    if groupings_id:
+        tf_doc_query = ( tf_doc_query
+            .outerjoin(Syno, Syno.c.ngram2_id == NodeNgram.ngram_id)
+        )
+        # now when we'll group_by the ngram2 freqs will be added to ngram1
+
+    if on_list_id:
+        Miamlist = aliased(NodeNgram)
+        tf_doc_query = ( tf_doc_query
+            .join(Miamlist, Miamlist.ngram_id == ngform_id)
+            .filter( Miamlist.node_id == on_list_id )
+        )
+
+    # execute query to do our tf sum
+    tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all()
+
+    # ex: [(128371, 9732, 1.0),
+    #      (128383, 9740, 1.0),
+    #      (128373, 9731, 1.0),
+    #      (128376, 9734, 1.0),
+    #      (128372, 9731, 1.0),
+    #      (128383, 9733, 1.0),
+    #      (128383, 9735, 1.0),
+    #      (128389, 9734, 1.0),
+    #      (8624, 9731, 1.0),
+    #      (128382, 9740, 1.0),
+    #      (128383, 9739, 1.0),
+    #      (128383, 9736, 1.0),
+    #      (128378, 9735, 1.0),
+    #      (128375, 9733, 4.0),
+    #      (128383, 9732, 1.0)]
+    #       ^ngram   ^doc  ^freq in this doc
+
+    # simultaneously count docs with given term (number of rows = M ngrams)
+    ndocswithngram = {}
+    for triple in tf_per_doc:
+        ng = triple[0]
+        doc = triple[1]
+        if ng in ndocswithngram:
+            ndocswithngram[ng] += 1
+        else:
+            ndocswithngram[ng] = 1
+    # print(ndocswithngram)
+
+    # store for use in formula
+    # { ngram_id => log(nd) }
+    log_nd_lookup = {ng : log(nd_count)
+                     for (ng, nd_count) in ndocswithngram.items()}
+
     # ---------------------------------------------------------
     tfidfs = {}
     log_tot_docs = log(total_docs)
-    for (ngram_id, node_id, tf) in tf_doc_ng:
+    for (ngram_id, node_id, tf) in tf_per_doc:
         log_nd = log_nd_lookup[ngram_id]
         # tfidfs[ngram_id] = tf * log(total_docs/nd)
         tfidfs[node_id, ngram_id] = tf * (log_tot_docs-log_nd)
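The final loop applies the standard tf-idf weight tf * log(N/nd), computed as tf * (log N - log nd) so that one log(total_docs) is reused across all rows. A standalone numeric check with made-up values:

    from math import log

    total_docs = 1000   # N: number of docs in the corpus
    nd         = 50     # docs containing this ngram
    tf         = 4.0    # occurrences of this ngram in this doc

    tfidf_direct = tf * log(total_docs / nd)
    tfidf_split  = tf * (log(total_docs) - log(nd))   # the form used in the loop

    assert abs(tfidf_direct - tfidf_split) < 1e-9
    print(round(tfidf_split, 4))   # 11.9829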
@@ -18,7 +18,7 @@ def compute_coocs( corpus,
                    stoplist_id     = None,
                    start           = None,
                    end             = None,
-                   symmetry_filter = True):
+                   symmetry_filter = False):
     """
     Count how often some extracted terms appear
     together in a small context (document)
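The flipped default on symmetry_filter changes how many cooccurrence rows are materialized. A toy sketch of the presumed distinction, under the assumption that the filter keeps only one orientation (i < j) of each symmetric pair; the actual SQL is not shown in this excerpt:

    from itertools import combinations, permutations

    # One "document" as a set of ngram ids (made-up values).
    doc_ngrams = [9731, 9733, 9735]

    # symmetry_filter=True (old default): half matrix, one row per unordered pair.
    half_matrix = list(combinations(sorted(doc_ngrams), 2))

    # symmetry_filter=False (new default): both orientations of each pair.
    full_matrix = list(permutations(doc_ngrams, 2))

    print(len(half_matrix), len(full_matrix))   # 3 6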