Commit 744ec7f1 authored by Romain Loth

mainlist creation

parent 89c8268c
......@@ -93,7 +93,9 @@ RESOURCETYPES = [
]
# linguistic extraction parameters
DEFAULT_COOC_THRESHOLD = 4
DEFAULT_TFIDF_CUTOFF_RATIO = .55 # MAINLIST maximum terms, relative to nb of distinct ngrams
DEFAULT_TFIDF_HARD_LIMIT = 1000 # MAINLIST maximum terms, absolute cap
DEFAULT_COOC_THRESHOLD = 4 # for COOCCURRENCES node
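# (the two MAINLIST caps above are combined in list_main.do_mainlist as
#  min(DEFAULT_TFIDF_HARD_LIMIT, floor(nb_ngrams * DEFAULT_TFIDF_CUTOFF_RATIO)))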
# other parameters
# default number of docs POSTed to scrappers.views.py
......
from .parsing import parse
from .ngrams_extraction import extract_ngrams
from .list_stop import compute_stop
from .list_stop import do_stoplist
from .ngram_scores import compute_occurrences_local, compute_tfidf
from .list_main import do_mainlist
from .ngram_coocs_tempo import compute_coocs
from .score_specificity import compute_specificity
from .list_map import compute_mapList # TEST
......@@ -24,6 +25,12 @@ def parse_extract(corpus):
    # apply actions
    print('CORPUS #%d' % (corpus.id))
    parse(corpus)
    # was there an error in the process?
    if corpus.status()['error']:
        print("ERROR: aborting parse_extract for corpus #%i" % corpus.id)
        return None
    print('CORPUS #%d: parsed' % (corpus.id))
    extract_ngrams(corpus)
    print('CORPUS #%d: extracted ngrams' % (corpus.id))
......@@ -45,16 +52,16 @@ def parse_extract(corpus):
    gtfidf_id = compute_tfidf(corpus, scope="global")
    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
    # ?? mainlist: compute + write (to Node and NodeNgram)
    # mainlist_id = compute_mainlist(corpus)
    # print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
    # -> mainlist: compute + write (to Node and NodeNgram)
    mainlist_id = do_mainlist(corpus, tfidf_id = gtfidf_id, stoplist_id = stop_id)
    print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
    cooc_id = compute_coocs(corpus, stop_id = None)
    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, stop_id = None)
    print('CORPUS #%d: [%s] new cooccs node #%i' % (corpus.id, t(), cooc_id))
    # ?? specificity: compute + write (=> NodeNodeNgram)
    spec_id = compute_specificity(cooc_id=cooc_id, corpus=corpus)
    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
    # ?? maplist: compute + write (to Node and NodeNgram)
......@@ -70,5 +77,7 @@ def parse_extract(corpus):
    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
def t():
    return datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, DEFAULT_TFIDF_HARD_LIMIT
from math import floor
def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
                hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
                ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO
                ):
"""
Select terms for the mainlist according to a global tfidf and stoplist.
The number of selected terms will be:
min(hard_limit, number_of_terms * ratio_limit)
NB : We use a global tfidf node where the values are global but the ngrams
are already selected (== only within this corpus documents).
Parameters:
2 limits are useful to set a maximum amount of picked terms
- ratio_limit: relative to the number of distinct ngrams [0,1]
- hard_limit: absolute value [default: 1000]
"""
    # retrieve helper nodes if not provided
    if not tfidf_id:
        tfidf_id = session.query(Node.id).filter(
                        Node.typename == "TFIDF-GLOBAL",
                        Node.parent_id == corpus.id
                   ).first()
        if not tfidf_id:
            raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")
        # .first() on a single-column query returns a 1-tuple => unpack the id
        tfidf_id = tfidf_id[0]

    if not stoplist_id:
        stoplist_id = session.query(Node.id).filter(
                        Node.typename == "STOPLIST",
                        Node.parent_id == corpus.id
                      ).first()
        if not stoplist_id:
            raise ValueError("MAINLIST: STOPLIST node needed for mainlist creation")
        stoplist_id = stoplist_id[0]
    # the ngrams we don't want
    # NOTE: make sure this runs only once, during the initial ngram workflow
    stopterms_subquery = (session
                            .query(NodeNgram.ngram_id)
                            .filter(NodeNgram.node_id == stoplist_id)
                            .subquery()
                         )
    # tfidf-ranked query
    ordered_filtered_tfidf = (session
                                .query(NodeNodeNgram.ngram_id)
                                .filter(NodeNodeNgram.node1_id == tfidf_id)
                                .filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
                                .order_by(desc(NodeNodeNgram.score))
                             )
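    # NB: "~ NodeNodeNgram.ngram_id.in_(stopterms_subquery)" renders as a SQL
    #     NOT IN, so stopterms are excluded before the ranking and the limit.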
    # total count
    nb_ngrams = ordered_filtered_tfidf.count()

    # apply whichever of the two limits is smaller
    our_limit = min(hard_limit, floor(nb_ngrams * ratio_limit))

    # DB retrieve up to limit => MAINLIST
    # (each result row is a 1-tuple, so unpack to plain ngram ids)
    top_ngrams_ids = [ngram_id for (ngram_id,) in ordered_filtered_tfidf.limit(our_limit)]
    # now create the new MAINLIST node
    mainlist = corpus.add_child(
        typename = "MAINLIST",
        name = "Mainlist (in:%s)" % corpus.name[0:10]
    )
    session.add(mainlist)
    session.commit()
    the_id = mainlist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    UnweightedList(top_ngrams_ids).save(the_id)

    return the_id
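A minimal usage sketch (illustrative only; it assumes parse_extract has already created the global tfidf and stoplist nodes, with their ids bound to gtfidf_id and stop_id as above):

# hypothetical call, mirroring the parse_extract workflow
mainlist_id = do_mainlist(corpus,
                          tfidf_id = gtfidf_id,   # from compute_tfidf(corpus, scope="global")
                          stoplist_id = stop_id)  # from the stoplist step
print('new mainlist node #%i' % mainlist_id)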