Commit 58aa990d authored by Romain Loth

maplist generation and better estimates for constants (thresholds)

parent 744ec7f1
@@ -9,9 +9,11 @@ LISTTYPES = {
    'STOPLIST'     : UnweightedList,
    'MAINLIST'     : UnweightedList,
    'MAPLIST'      : UnweightedList,
    'SPECIFICITY'  : WeightedList,
    'OCCURRENCES'  : WeightedContextIndex,
    'COOCCURRENCES': WeightedMatrix,
    'TFIDF-CORPUS' : WeightedContextIndex,
    'TFIDF-GLOBAL' : WeightedContextIndex,
}

NODETYPES = [
@@ -92,10 +94,21 @@ RESOURCETYPES = [
    # },
]
# linguistic extraction parameters ---------------------------------------------
DEFAULT_TFIDF_CUTOFF_RATIO      = .45   # MAINLIST maximum terms in %

DEFAULT_TFIDF_HARD_LIMIT        = 750   # MAINLIST maximum terms abs
                                        # (makes COOCS larger ~ O(N²) /!\)

DEFAULT_COOC_THRESHOLD          = 5     # inclusive minimum for COOCS coefs
                                        # (makes COOCS more sparse)

DEFAULT_MAPLIST_MAX             = 300   # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5    # part of monograms in MAPLIST
                                        # (NB: used to be 0.005 !!)
# ------------------------------------------------------------------------------
# other parameters
# default number of docs POSTed to scrappers.views.py
...
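For instance, with the defaults above the MAPLIST is capped at 300 terms, half of which may be monograms. A quick sketch of the arithmetic (illustration only, not code from the commit):

limit = 300                        # DEFAULT_MAPLIST_MAX
monograms_part = .5                # DEFAULT_MAPLIST_MONOGRAMS_RATIO
monograms_limit  = round(limit * monograms_part)    # = 150
multigrams_limit = limit - monograms_limit          # = 150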
from .parsing import parse
from .ngrams_extraction import extract_ngrams
# in usual run order
from .list_stop import do_stoplist
from .ngram_scores import compute_occurrences_local, compute_tfidf
from .list_main import do_mainlist
from .ngram_coocs_tempo import compute_coocs
from .score_specificity import compute_specificity
from .list_map import do_maplist # TEST
from .ngram_groups import compute_groups

from gargantext.util.db import session
@@ -40,10 +41,19 @@ def parse_extract(corpus):
    # -------------------------------
    print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))

    # -> stoplist: filter + write (to Node and NodeNgram)
    stop_id = do_stoplist(corpus)
    print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))

    # -> write groups to Node and NodeNgramNgram
    group_id = compute_groups(corpus, stoplist_id = None)
    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))

    # -> write occurrences to Node and NodeNodeNgram  # possible: factorize with tfidf
    occ_id = compute_occurrences_local(corpus)
    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

    # ------------
    # -> write local tfidf to Node and NodeNodeNgram
    ltfidf_id = compute_tfidf(corpus, scope="local")
    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
@@ -52,31 +62,27 @@ def parse_extract(corpus):
    gtfidf_id = compute_tfidf(corpus, scope="global")
    print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))

    # -> mainlist: filter + write (to Node and NodeNgram)
    mainlist_id = do_mainlist(corpus,
                              tfidf_id = gtfidf_id,
                              stoplist_id = stop_id)
    print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))

    # ------------
    # -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
    cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
    print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))

    # -> specificity: compute + write (=> NodeNodeNgram)
    spec_id = compute_specificity(corpus, cooc_id=cooc_id)
    print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))

    # ?? maplist: compute + write (to Node and NodeNgram)
    map_id = do_maplist(corpus,
                        mainlist_id = mainlist_id,
                        specificity_id=spec_id,
                        grouplist_id=group_id)
    print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))

def t():
...
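The overwrite_id parameters introduced in the list functions below (do_mainlist, do_maplist, compute_coocs, do_stoplist, ...) are there so a list step can be re-run without piling up nodes. A hypothetical re-run, assuming the node ids from a previous pass are at hand (sketch only, not part of the diff):

# sketch: recompute the maplist in place, reusing the previous MAPLIST node
map_id = do_maplist(corpus,
                    mainlist_id    = mainlist_id,
                    specificity_id = spec_id,
                    grouplist_id   = group_id,
                    overwrite_id   = map_id)   # previous maplist node gets overwritten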
@@ -2,26 +2,38 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, \
                                 DEFAULT_TFIDF_HARD_LIMIT

def do_mainlist(corpus,
                overwrite_id = None,
                tfidf_id=None, stoplist_id=None,
                hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
                ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO
                ):
""" """
Select terms for the mainlist according to a global tfidf and stoplist. Select top n terms according to a global tfidf ranking and stoplist filter.
The number of selected terms will be: The number of selected terms will be:
min(hard_limit, number_of_terms * ratio_limit) min(hard_limit, number_of_terms * ratio_limit)
NB : We use a global tfidf node where the values are global but the ngrams NB : We use a global tfidf node where the values are global but the ngrams
are already selected (== only within this corpus documents). are already selected (== only within this corpus documents).
TO DISCUSS: allow influence of the local tfidf scores too
Parameters: Parameters:
2 limits are useful to set a maximum amount of picked terms - the corpus itself
- ratio_limit: relative to the number of distinct ngrams [0,1] - a tfidf score for ranking the ngrams
- hard_limit: absolute value [default: 1000] - a stoplist for filtering some ngrams
- overwrite_id: optional id of a pre-existing MAINLIST node for this corpus
(the Node and its previous NodeNgram rows will be replaced)
+ 2 limits to set the amount of picked terms:
- ratio_limit ∈ [0,1]: a ratio relative to the number of distinct ngrams
(default: 0.55)
- hard_limit: an absolute max value
(default: 1000)
""" """
    # retrieve helper nodes if not provided
@@ -61,20 +73,26 @@ def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
    nb_ngrams = ordered_filtered_tfidf.count()

    # apply ratio to find smallest limit
    our_limit = min(hard_limit, round(nb_ngrams * ratio_limit))

    print("MAINLIST: keeping %i ngrams out of %i" % (our_limit, nb_ngrams))

    # DB retrieve up to limit => MAINLIST
    top_ngrams_ids = ordered_filtered_tfidf.limit(our_limit).all()

    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        # mainlist = cache.Node[overwrite_id]
    else:
        # now create the new MAINLIST node
        mainlist = corpus.add_child(
            typename = "MAINLIST",
            name = "Mainlist (in:%s)" % corpus.id
        )
        session.add(mainlist)
        session.commit()
        the_id = mainlist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    UnweightedList(top_ngrams_ids).save(the_id)
...
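A worked instance of the cutoff rule above, with a hypothetical ngram count (illustration only):

# hypothetical: 1200 distinct tfidf-ranked ngrams after stoplist filtering
nb_ngrams = 1200
our_limit = min(750, round(1200 * .45))   # round(540.0) = 540 -> the ratio wins
# with 2000 ngrams the hard limit would win instead: min(750, 900) = 750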
"""
Selects a subset of corpus ngrams to use in the graph map.
"""
from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
                                     NodeNgramNgram, NodeNodeNgram
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\
                                 DEFAULT_MAPLIST_MONOGRAMS_RATIO

def do_maplist(corpus,
               overwrite_id = None,
               mainlist_id = None,
               specificity_id = None,
               grouplist_id = None,
               limit=DEFAULT_MAPLIST_MAX,
               monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
               ):
    '''
    According to Specificities and mainlist

    Parameters:
      - mainlist_id (starting point, already cleaned of stoplist terms)
      - specificity_id (ranking factor)
      - grouplist_id (filtering grouped ones)
      - overwrite_id: optional if preexisting MAPLIST node to overwrite

      + 2 constants to modulate the terms choice
        - limit for the amount of picked terms
        - monograms_part: a ratio of terms with only one lexical unit to keep
    '''
    if not (mainlist_id and specificity_id and grouplist_id):
        raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")

    monograms_limit = round(limit * monograms_part)
    multigrams_limit = limit - monograms_limit
    print("MAPLIST: monograms_limit =", monograms_limit)
    print("MAPLIST: multigrams_limit = ", multigrams_limit)

    #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

    mainterms_subquery = (session
                            # we want only terms within mainlist
                            .query(NodeNgram.ngram_id)
                            .filter(NodeNgram.node_id == mainlist_id)
                            .subquery()
                         )

    primary_groupterms_subquery = (session
                            # we want only primary terms (ngram1)
                            .query(NodeNgramNgram.ngram1_id)
                            .filter(NodeNgramNgram.node_id == grouplist_id)
                            .subquery()
                         )

    ScoreSpec=aliased(NodeNgram)

    # specificity-ranked
    query = (session.query(ScoreSpec.ngram_id)
                .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
                .filter(ScoreSpec.node_id == specificity_id)
                .filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
                .filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
            )

    top_monograms = (query
                        .filter(Ngram.n == 1)
                        .order_by(desc(ScoreSpec.weight))
                        .limit(monograms_limit)
                        .all()
                    )

    top_multigrams = (query
                        .filter(Ngram.n >= 2)
                        .order_by(desc(ScoreSpec.weight))
                        .limit(multigrams_limit)
                        .all()
                     )

    print("MAPLIST: top_monograms =", len(top_monograms))
    print("MAPLIST: top_multigrams = ", len(top_multigrams))

    # NEW MAPLIST NODE
    # -----------------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = { 'corpus': corpus.id,
                      'limit' : limit,
                      'monograms_part' : monograms_part
                    }
    if overwrite_id:
        # overwrite pre-existing node
        the_maplist = cache.Node[overwrite_id]
        the_maplist.hyperdata = new_hyperdata
        the_maplist.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create a new maplist node
        the_maplist = corpus.add_child(
            name="Maplist (in %i)" % corpus.id,
            typename="MAPLIST",
            hyperdata = new_hyperdata
        )
        session.add(the_maplist)
        session.commit()
        the_id = the_maplist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    datalist = UnweightedList(
        [res.ngram_id for res in top_monograms + top_multigrams]
    )

    # save
    datalist.save(the_id)

    # dbg.show('MapList computed')

    return the_id
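The selection implemented by the SQLAlchemy query above can be read as a plain ranking: keep only terms that are both in the mainlist and primary forms in the grouplist, sort them by specificity, then take the monogram and multigram tops separately. A rough Python paraphrase with hypothetical in-memory data (not the actual DB code):

# hypothetical in-memory stand-ins for the DB rows
spec_scores = {101: 3.2, 102: 2.9, 103: 1.1}      # ngram_id -> specificity weight
mainlist_ids = {101, 102, 103}
primary_group_ids = {101, 102}                    # ngram1_id side of the grouplist
ngram_len = {101: 1, 102: 2, 103: 2}              # Ngram.n
monograms_limit, multigrams_limit = 150, 150

candidates = [ng for ng in spec_scores
              if ng in mainlist_ids and ng in primary_group_ids]
ranked = sorted(candidates, key=lambda ng: spec_scores[ng], reverse=True)
top_monograms  = [ng for ng in ranked if ngram_len[ng] == 1][:monograms_limit]
top_multigrams = [ng for ng in ranked if ngram_len[ng] >= 2][:multigrams_limit]
# -> top_monograms == [101], top_multigrams == [102]; 103 is dropped (not a primary group form)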
"""
Creates a filtering list for corpus ngrams.
(implementation: regexp + "master" stoplist)
"""
from gargantext.util.db import session, func
from gargantext.util.lists import WeightedMatrix
from gargantext.models import User, Node, Ngram, NodeNgram
from gargantext.constants import LISTTYPES
from re import compile
from sqlalchemy import desc

def is_stop_word(ngram, stop_words=None):
    '''
    ngram :: (Int, String) => (ngram_id, ngram_terms)
    stop_words :: Set of String
    (to avoid SQL query each time is_stop_word is invoked, get in as parameter)
    '''
    word = ngram[1]
@@ -41,7 +39,7 @@ def isStopWord(ngram, stop_words=None):
            , "(.*)(travers)(.*)"
            , "(.*)(:|\|)(.*)"
            ] :
        compiled_regexes.append(compile(regex))

    for format_regex in compiled_regexes:
        if format_regex.match(word):
@@ -61,32 +59,27 @@ def create_gargantua_resources():
    session.add(stopList)
    session.commit()

def do_stoplist(corpus, overwrite_id=None):
    '''
    Create list of stop words.
    TODO do a function to get all stop words with social scores

    Parameters:
        - overwrite_id: optional preexisting STOPLIST node to overwrite
    '''
    # Get preexisting StopList if provided in overwrite_id param
    if overwrite_id:
        stoplist_id = overwrite_id
    # At this step of development, a new StopList should be created
    else:
        stoplist = corpus.add_child(
            name="Stoplist (in:%s)" % corpus.id,
            typename="STOPLIST"
        )
        session.add(stoplist)
        session.commit()
        stoplist_id = stoplist.id

    # Get common resources, all common StopWords on the platform
    ## First get the id of the StopList of Gargantua super user
@@ -107,23 +100,23 @@ def compute_stop(corpus, stopList_id=None, debug=False):
    ## Get the ngrams
    ## ngrams :: [(Int, String, Int)]
    ngrams = (session.query( Ngram.id, Ngram.terms)
                .join( NodeNgram, NodeNgram.ngram_id == Ngram.id )
                .join( Node, Node.id == NodeNgram.node_id )
                .filter( Node.parent_id == corpus.id,
                         Node.typename == "DOCUMENT")
                .group_by( Ngram.id )
                #.limit(limit)
                .all()
              )

    ngrams_to_stop = filter(
        lambda x: is_stop_word(x, stop_words=stop_words), ngrams
    )

    # print([n for n in ngrams_to_stop])
    stop = LISTTYPES["STOPLIST"]({ n[0] : -1 for n in ngrams_to_stop})
    # stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])

    stop.save(stoplist_id)
    return stoplist_id
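One detail worth keeping in mind with the new filter(...) form: in Python 3, filter() returns a lazy iterator, so ngrams_to_stop can only be traversed once (here by the dict comprehension that builds the STOPLIST). An eager equivalent, with hypothetical ngram tuples (sketch only):

# hypothetical (id, terms) tuples and master stop_words set
ngrams = [(487, "of"), (294, "copper engraving"), (101, "fig. 1: overview")]
stop_words = {"of"}

ngrams_to_stop = [ng for ng in ngrams
                  if is_stop_word(ng, stop_words=stop_words)]
stop = LISTTYPES["STOPLIST"]({ng[0]: -1 for ng in ngrams_to_stop})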
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD

def compute_coocs(corpus,
                  overwrite_id = None,
                  threshold = DEFAULT_COOC_THRESHOLD,
                  mainlist_id = None,
                  stoplist_id = None,
                  symmetry_filter = True):
    """
    Count how often some extracted terms appear
    together in a small context (document)
    throughout a larger context (corpus).

    [NodeNgram]                    [NodeNgramNgram]

    node_id | ngram_id | weight    ngram1_id | ngram2_id | score |
    --------+----------+--------   ----------+-----------+-------+
     MYDOCA |   487    |    1   =>    487    |    294    |   2   |
     MYDOCA |   294    |    3
     MYDOCB |   487    |    1
     MYDOCB |   294    |    4

    Fill that info in DB:
      - a *new* COOCCURRENCES node
@@ -25,14 +30,16 @@ def compute_coocs(corpus,
    worst case complexity ~ O(N²/2) with N = number of ngrams

    If a mainlist is provided, we filter doc ngrams to those also in the list.

    Parameters:
      - the corpus node
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                      (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
      - mainlist_id: mainlist to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is provided)

    (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
@@ -54,14 +61,17 @@ def compute_coocs(corpus,
    coocs for each doc :
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (x1.ngram_id, x2.ngram_id)
      - we count unique appearances of the pair (cooc)
    """
    # - TODO cvalue_id: allow a metric as additional input filter
    # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
    # - TODO start, end : filter on document date
    # - TODO weighted: if False normal cooc to be saved as result
    #                  if True weighted cooc (experimental)

    # /!\ big combinatorial complexity /!\
    # for 8439 rows in the nodes_ngrams index, 1442 of which have occ > 1
    # 1,859,408 rows for the simple cooc query
@@ -94,10 +104,22 @@ def compute_coocs(corpus,
    # 2) INPUT FILTERS (reduce N before O(N²))
    if mainlist_id:
        main_subquery = (
            session.query(NodeNgram.ngram_id)
            .filter(NodeNgram.node_id == mainlist_id)
            .subquery()
        )

        coocs_query = ( coocs_query
            .filter( x1.ngram_id.in_(main_subquery) )
            .filter( x2.ngram_id.in_(main_subquery) )
        )

    if stoplist_id:
        stop_subquery = (
            session.query(NodeNgram.ngram_id)
            .filter(NodeNgram.node_id == stoplist_id)
            .subquery()
        )
@@ -128,30 +150,36 @@ def compute_coocs(corpus,
    # 3) OUTPUT FILTERS
    # ------------------
    # threshold
    # coocs_query = coocs_query.having(ucooc >= threshold)
    coocs_query = coocs_query.having(ucooc > threshold)

    # 4) EXECUTE QUERY
    # ----------------
    #  => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())

    # fyi
    # shape_0 = len({pair[0] for pair in matrix.items})
    # shape_1 = len({pair[1] for pair in matrix.items})
    # print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

    # 5) SAVE
    # --------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = { 'corpus': corpus.id,
                      'threshold': threshold }
    if overwrite_id:
        # overwrite pre-existing id
        the_cooc = cache.Node[overwrite_id]
        the_cooc.hyperdata = new_hyperdata
        the_cooc.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create the new cooc node
        the_cooc = corpus.add_child(
            typename  = "COOCCURRENCES",
            name      = "Coocs (in:%s)" % corpus.name[0:10],
            hyperdata = new_hyperdata,
        )
        session.add(the_cooc)
        session.commit()
...
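The docstring table above can be read as a counting rule: within each document, every unordered pair of retained ngrams counts once, and the counts are summed over documents. Note also that the output filter kept in the code uses a strict `> threshold` while the constant's comment calls it an inclusive minimum. A rough Python stand-in for the counting rule, with hypothetical doc contents (not the SQL implementation):

from itertools import combinations
from collections import Counter

# hypothetical doc -> retained ngram_ids (already restricted to the mainlist)
docs = {"MYDOCA": {487, 294}, "MYDOCB": {487, 294}}

coocs = Counter()
for ngram_ids in docs.values():
    for pair in combinations(sorted(ngram_ids), 2):
        coocs[pair] += 1                 # one appearance of the pair per doc

# keep only pairs strictly above the threshold, as in coocs_query.having(ucooc > threshold)
threshold = 1
kept = {pair: n for pair, n in coocs.items() if n > threshold}
# -> {(294, 487): 2}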
"""
For initial ngram groups via stemming
Example:
  - groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
  - groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
"""
from gargantext.models import Node, NodeNgramNgram
from gargantext.util.db import session
from gargantext.util.lists import Translations

# to convert fr => french :/
from gargantext.util.languages import languages

from re import split as resplit
from collections import defaultdict, Counter
from nltk.stem.snowball import SnowballStemmer

def prepare_stemmers(corpus):
    """
@@ -22,7 +29,7 @@ def prepare_stemmers(corpus):
        stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
    return stemmers_by_lg

def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
@@ -98,17 +105,21 @@ def compute_groups(corpus, stoplist_id = None):
    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    # or create the new id
    else:
        the_group = corpus.add_child(
            typename = "GROUPLIST",
            name = "Group (src:%s)" % corpus.name[0:10]
        )

        # and save the node
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to Translations.save() table
    ndngng_list = Translations(
...
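The grouping key in the module docstring ('copper engrav') comes from stemming each token of a term. A short sketch with NLTK's SnowballStemmer; the stem_key helper is hypothetical, only the stemmer and resplit usage follow the imports above:

from re import split as resplit
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

def stem_key(term):
    # hypothetical helper: one key per stemmed form of the term
    return " ".join(stemmer.stem(tok) for tok in resplit(r"\W+", term.lower()) if tok)

stem_key("copper engraving")   # 'copper engrav'
stem_key("coppers engraver")   # 'copper engrav'  -> same group as above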
"""
Computes ngram scores with 3 ranking functions:
- the simple sum of occurrences inside the corpus
- the tfidf inside the corpus
- the global tfidf for all corpora having same source
FIXME: "having the same source" means we need to select inside hyperdata
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text from sqlalchemy import text # for query from raw SQL statement
from math import log
# £TODO # £TODO
# from gargantext.util.lists import WeightedContextIndex # from gargantext.util.lists import WeightedContextIndex
from gargantext.util.db import func # = sqlalchemy.func like sum() or count()
from math import log
def compute_occurrences_local(corpus): def compute_occurrences_local(corpus, overwrite_id = None):
""" """
Calculates sum of occs per ngram within corpus Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
? optimize ? OCCS here could be calculated simultaneously within TFIDF-CORPUS loop
Parameters:
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
""" """
# 1) all the doc_ids of our corpus (scope of counts for filter) # 1) all the doc_ids of our corpus (scope of counts for filter)
...@@ -37,32 +52,41 @@ def compute_occurrences_local(corpus): ...@@ -37,32 +52,41 @@ def compute_occurrences_local(corpus):
# ^^^^ ^^^ # ^^^^ ^^^
# ngram_id sum_wei # ngram_id sum_wei
# create the new OCCURRENCES node
occnode = Node() if overwrite_id:
occnode.typename = "OCCURRENCES" # overwrite pre-existing id
occnode.name = "occ_sums (in:%s)" % corpus.id the_id = overwrite_id
occnode.parent_id = corpus.id # occnode = cache.Node[overwrite_id]
occnode.user_id = corpus.user_id else:
session.add(occnode) # create the new OCCURRENCES node
session.commit() occnode = corpus.add_child(
typename = "OCCURRENCES",
name = "occ_sums (in:%s)" % corpus.id
)
session.add(occnode)
session.commit()
the_id = occnode.id
# reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf) # reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
# £TODO replace bulk_insert by something like WeightedContextMatrix.save() # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert( bulk_insert(
NodeNodeNgram, NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'), ('node1_id' , 'node2_id', 'ngram_id', 'score'),
((occnode.id, corpus.id, res[0], res[1]) for res in occ_sums) ((the_id, corpus.id, res[0], res[1]) for res in occ_sums)
) )
return occnode.id return the_id
def compute_tfidf(corpus, scope="local"): def compute_tfidf(corpus, scope="local", overwrite_id=None):
""" """
Calculates tfidf within the current corpus Calculates tfidf within the current corpus
Parameter: Parameters:
- the corpus itself
- scope: {"local" or "global"} - scope: {"local" or "global"}
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
""" """
# local <=> within this corpus # local <=> within this corpus
...@@ -121,23 +145,27 @@ def compute_tfidf(corpus, scope="local"): ...@@ -121,23 +145,27 @@ def compute_tfidf(corpus, scope="local"):
tfidfs[ngram_id] = tf * (log_tot_docs-log(nd)) tfidfs[ngram_id] = tf * (log_tot_docs-log(nd))
# ------------------------------------------------- # -------------------------------------------------
# create the new TFIDF-CORPUS node if overwrite_id:
tfidf_nd = Node(parent_id = corpus.id, user_id = corpus.user_id) the_id = overwrite_id
if scope == "local": else:
tfidf_nd.typename = "TFIDF-CORPUS" # create the new TFIDF-XXXX node
tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id tfidf_nd = corpus.add_child()
elif scope == "global": if scope == "local":
tfidf_nd.typename = "TFIDF-GLOBAL" tfidf_nd.typename = "TFIDF-CORPUS"
tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id
session.add(tfidf_nd) elif scope == "global":
session.commit() tfidf_nd.typename = "TFIDF-GLOBAL"
tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type
session.add(tfidf_nd)
session.commit()
the_id = tfidf_nd.id
# reflect that in NodeNodeNgrams # reflect that in NodeNodeNgrams
# £TODO replace bulk_insert by something like WeightedContextMatrix.save() # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert( bulk_insert(
NodeNodeNgram, NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'), ('node1_id', 'node2_id','ngram_id', 'score'),
((tfidf_nd.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs) ((the_id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
) )
return tfidf_nd.id return the_id
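The tfidf line kept above uses a natural-log idf: tf * (ln(total_docs) - ln(nd)), i.e. tf * ln(total_docs / nd). A small numeric check with hypothetical counts (sketch only):

from math import log

total_docs = 1000          # hypothetical corpus (or source-wide) document count
log_tot_docs = log(total_docs)

tf = 7                     # occurrences of the ngram in the tf scope
nd = 50                    # number of documents containing the ngram

tfidf = tf * (log_tot_docs - log(nd))   # = 7 * ln(1000/50) = 7 * ln(20) ≈ 20.97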
"""
Computes a specificity metric from the ngram cooccurrence matrix.
 + SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame

def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
    '''
    Compute the specificity, simple calculus.

    Parameters:
      - cooc_id: mandatory id of a cooccurrences node to use as base
      - overwrite_id: optional preexisting specificity node to overwrite
    '''

    cooccurrences = (session.query(NodeNgramNgram)
                    .filter(NodeNgramNgram.node_id==cooc_id)
                    )
    # no filtering: new choice cooc already filtered on tfidf before creation

    matrix = defaultdict(lambda : defaultdict(float))
@@ -30,7 +31,9 @@ def compute_specificity(corpus, cooc_id, limit=100):
    nb_ngrams = len(matrix)

    print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)

    d = DataFrame(matrix).fillna(0)

    # proba (x/y) ( <= each column is divided by its total)
    d = d / d.sum(axis=0)
@@ -74,28 +77,27 @@ def compute_specificity(corpus, cooc_id, limit=100):
    # ----------------
    # specificity node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id==the_id).delete()
        session.commit()
    else:
        specnode = corpus.add_child(
            typename = "SPECIFICITY",
            name = "Specif (in:%s)" % corpus.id
        )
        session.add(specnode)
        session.commit()
        the_id = specnode.id

    # print(v)

    data = WeightedList(
                zip(  v.index.tolist()
                    , v.values.tolist()
                   )
           )
    data.save(the_id)

    return(the_id)
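The normalization step above turns each cooccurrence column into a conditional distribution: d / d.sum(axis=0) divides every column by its own total. A tiny check with pandas and hypothetical counts:

from pandas import DataFrame

# hypothetical cooc counts: each column is the "given" term
d = DataFrame({'graph': {'graph': 0.0, 'node': 4.0, 'edge': 6.0},
               'node':  {'graph': 4.0, 'node': 0.0, 'edge': 2.0}})

p = d / d.sum(axis=0)     # each column now sums to 1
# p['graph'] -> graph: 0.0, node: 0.4, edge: 0.6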
@@ -94,6 +94,7 @@ def project(request, project_id):
        )
        session.add(corpus)
        session.commit()
        # parse_extract: fileparsing -> ngram extraction -> lists
        scheduled(parse_extract)(corpus.id)

    # corpora within this project
@@ -101,16 +102,26 @@ def project(request, project_id):
    sourcename2corpora = defaultdict(list)
    for corpus in corpora:
        # we only consider the first resource of the corpus to determine its type
        resources = corpus.resources()
        if len(resources):
            resource = resources[0]
            resource_type_name = RESOURCETYPES[resource['type']]['name']
        else:
            print("(WARNING) PROJECT view: no listed resource")

        # add some data for the viewer
        corpus.count = corpus.children('DOCUMENT').count()
        status = corpus.status()
        if status is not None and not status['complete']:
            if not status['error']:
                corpus.status_message = '(in progress: %s, %d complete)' % (
                    status['action'].replace('_', ' '),
                    status['progress'],
                )
            else:
                corpus.status_message = '(aborted: "%s" after %i docs)' % (
                    status['error'][-1],
                    status['progress']
                )
        else:
            corpus.status_message = ''
        # add
...