Commit 6260e8c1 authored by Romain Loth

maplist generation and better estimates for constants (thresholds)

parent 57517450
......@@ -9,9 +9,11 @@ LISTTYPES = {
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECIFICITY' : WeightedList,
'OCCURRENCES' : WeightedContextIndex,
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedContextIndex,
'TFIDF-GLOBAL' : WeightedContextIndex,
}
NODETYPES = [
......@@ -92,10 +94,21 @@ RESOURCETYPES = [
# },
]
# linguistic extraction parameters
DEFAULT_TFIDF_CUTOFF_RATIO = .55 # for MAINLIST maximum terms
DEFAULT_TFIDF_HARD_LIMIT = 1000 # for MAINLIST maximum terms
DEFAULT_COOC_THRESHOLD = 4 # for COOCCURRENCES node
# linguistic extraction parameters ---------------------------------------------
DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms, as a ratio of distinct ngrams
DEFAULT_TFIDF_HARD_LIMIT = 750 # MAINLIST maximum terms, absolute cap
# (makes COOCS larger ~ O(N²) /!\)
DEFAULT_COOC_THRESHOLD = 5 # inclusive minimum for COOCS coefs
# (makes COOCS more sparse)
DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5 # proportion of monograms in the MAPLIST
# (NB: used to be 0.005 !!)
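# For illustration (a sketch, not enforced here): with these defaults the
# maplist budget splits as round(300 * .5) = 150 monograms + 150 multigrams,
# and a COOC pair is kept only when its count reaches the threshold (>= 5).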
# ------------------------------------------------------------------------------
# other parameters
# default number of docs POSTed to scrappers.views.py
......
from .parsing import parse
from .ngrams_extraction import extract_ngrams
# in usual run order
from .list_stop import do_stoplist
from .ngram_scores import compute_occurrences_local, compute_tfidf
from .list_main import do_mainlist
from .ngram_coocs_tempo import compute_coocs
from .score_specificity import compute_specificity
from .list_map import compute_mapList # TEST
from .list_map import do_maplist # TEST
from .ngram_groups import compute_groups
from gargantext.util.db import session
......@@ -40,10 +41,19 @@ def parse_extract(corpus):
# -------------------------------
print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))
# -> stoplist: compute + write (=> Node and NodeNgram)
stop_id = compute_stop(corpus)
# -> stoplist: filter + write (to Node and NodeNgram)
stop_id = do_stoplist(corpus)
print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))
# -> write groups to Node and NodeNgramNgram
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occurrences_local(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------
# -> write local tfidf to Node and NodeNodeNgram
ltfidf_id = compute_tfidf(corpus, scope="local")
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
......@@ -52,31 +62,27 @@ def parse_extract(corpus):
gtfidf_id = compute_tfidf(corpus, scope="global")
print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
# -> mainlist: compute + write (to Node and NodeNgram)
mainlist_id = mainlist_filter(corpus, tfidf_id = gtfidf_id, stoplist_id = stop_id)
# -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id = do_mainlist(corpus,
tfidf_id = gtfidf_id,
stoplist_id = stop_id)
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, stop_id = None)
print('CORPUS #%d: [%s] new cooccs node #%i' % (corpus.id, t(), cooc_id))
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# ?? specificity: compute + write (=> NodeNodeNgram)
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), cooc_id))
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
# map_id = compute_stop(corpus)
# print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occurrences_local(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# -> write groups to Node and NodeNgramNgram
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
map_id = do_maplist(corpus,
mainlist_id = mainlist_id,
specificity_id=spec_id,
grouplist_id=group_id)
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
def t():
......
......@@ -2,26 +2,38 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, DEFAULT_TFIDF_HARD_LIMIT
from math import floor
from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, \
DEFAULT_TFIDF_HARD_LIMIT
def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
def do_mainlist(corpus,
overwrite_id = None,
tfidf_id=None, stoplist_id=None,
hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO
):
"""
Select terms for the mainlist according to a global tfidf and stoplist.
Select top n terms according to a global tfidf ranking and stoplist filter.
The number of selected terms will be:
min(hard_limit, number_of_terms * ratio_limit)
NB: we use a global tfidf node where the scores are global but the ngrams
are already restricted to this corpus's documents.
TO DISCUSS: allow influence of the local tfidf scores too
Parameters:
2 limits are useful to set a maximum amount of picked terms
- ratio_limit: relative to the number of distinct ngrams [0,1]
- hard_limit: absolute value [default: 1000]
- the corpus itself
- a tfidf score for ranking the ngrams
- a stoplist for filtering some ngrams
- overwrite_id: optional id of a pre-existing MAINLIST node for this corpus
(the Node and its previous NodeNgram rows will be replaced)
+ 2 limits to set the amount of picked terms:
- ratio_limit ∈ [0,1]: a ratio relative to the number of distinct ngrams
(default: 0.45, cf. DEFAULT_TFIDF_CUTOFF_RATIO)
- hard_limit: an absolute max value
(default: 750, cf. DEFAULT_TFIDF_HARD_LIMIT)
"""
# retrieve helper nodes if not provided
......@@ -61,20 +73,26 @@ def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
nb_ngrams = ordered_filtered_tfidf.count()
# apply ratio to find smallest limit
our_limit = min(hard_limit, floor(nb_ngrams * ratio_limit))
our_limit = min(hard_limit, round(nb_ngrams * ratio_limit))
print("MAINLIST: keeping %i ngrams out of %i" % (our_limit,nb_ngrams))
# DB retrieve up to limit => MAINLIST
top_ngrams_ids = ordered_filtered_tfidf.limit(our_limit).all()
# now create the new MAINLIST node
mainlist = corpus.add_child(
typename = "MAINLIST",
name = "Mainlist (in:%s)" % corpus.name[0:10]
)
session.add(mainlist)
session.commit()
the_id = mainlist.id
if overwrite_id:
# overwrite pre-existing id
the_id = overwrite_id
# mainlist = cache.Node[overwrite_id]
else:
# now create the new MAINLIST node
mainlist = corpus.add_child(
typename = "MAINLIST",
name = "Mainlist (in:%s)" % corpus.id
)
session.add(mainlist)
session.commit()
the_id = mainlist.id
# create UnweightedList object and save (=> new NodeNgram rows)
UnweightedList(top_ngrams_ids).save(the_id)
......
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
"""
Selects a subset of corpus ngrams to use in the graph map.
"""
from gargantext.models.ngrams import Ngram, NodeNgram,\
NodeNodeNgram, NodeNgramNgram
from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
NodeNgramNgram, NodeNodeNgram
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\
DEFAULT_MAPLIST_MONOGRAMS_RATIO
def do_maplist(corpus,
overwrite_id = None,
mainlist_id = None,
specificity_id = None,
grouplist_id = None,
limit=DEFAULT_MAPLIST_MAX,
monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
):
'''
Builds the maplist from the mainlist terms, ranked by specificity.
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from gargantext.util.toolchain.ngram_tools import insert_ngrams
import csv
Parameters:
- mainlist_id (starting point, already cleaned of stoplist terms)
- specificity_id (ranking factor)
- grouplist_id (filtering grouped ones)
- overwrite_id: optional id of a preexisting MAPLIST node to overwrite
def compute_mapList(corpus_id,limit=500,n=1, session=None):
'''
According to Specificities and stoplist,
+ 2 constants to modulate the terms choice
- limit: the total amount of picked terms
- monograms_part: the proportion of those terms that are monograms (single lexical unit)
'''
monograms_part = 0.005
if not (mainlist_id and specificity_id and grouplist_id):
raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")
monograms_limit = round(limit * monograms_part)
multigrams_limit = limit - monograms_limit
print("MAPLIST: monograms_limit =", monograms_limit)
print("MAPLIST: multigrams_limit = ", multigrams_limit)
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
list_main_id = session.query(Node.id).filter(
Node.typename == "MAINLIST",
Node.parent_id == corpus_id).first()
list_stop_id = session.query(Node.id).filter(
Node.typename == "STOPLIST",
Node.parent_id == corpus_id).first()
list_group_id = session.query(Node.id).filter(
Node.typename == "GROUPLIST",
Node.parent_id == corpus_id).first()
score_spec_id = session.query(Node.id).filter(
Node.typename == "SPECIFICITY",
Node.parent_id == corpus_id).first()
ListMain=aliased(NodeNgram)
ListStop=aliased(NodeNgram)
ListGroup=aliased(NodeNgramNgram)
ScoreSpec=aliased(NodeNodeNgram)
# FIXME outerjoin does not work with current SqlAlchemy
# lines below the query do the job but it can be improved
query = (session.query(ScoreSpec.ngram_id, ScoreSpec.score)
.join(ListMain, ScoreSpec.ngram_id == ListMain.ngram_id)
mainterms_subquery = (session
# we want only terms within mainlist
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == mainlist_id)
.subquery()
)
primary_groupterms_subquery = (session
# we want only primary terms (ngram1)
.query(NodeNgramNgram.ngram1_id)
.filter(NodeNgramNgram.node_id == grouplist_id)
.subquery()
)
ScoreSpec=aliased(NodeNgram)
# specificity-ranked
query = (session.query(ScoreSpec.ngram_id)
.join(Ngram, Ngram.id == ScoreSpec.ngram_id)
#.outerjoin(ListGroup, Group.ngramy_id == ScoreSpec.ngram_id)
#.outerjoin(ListStop, Stop.ngram_id == ScoreSpec.ngram_id)
.filter(ListMain.node_id == list_main_id)
#.filter(ListGroup.node_id == list_group_id)
#.filter(ListStop.node_id == list_stop_id)
.filter(ScoreSpec.nodex_id == score_spec_id)
.filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
.filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
)
top_monograms = (query
.filter(Ngram.n == 1)
.order_by(desc(ScoreSpec.score))
.order_by(desc(ScoreSpec.weight))
.limit(monograms_limit)
.all()
)
top_multigrams = (query
.filter(Ngram.n >= 2)
.order_by(desc(ScoreSpec.score))
.order_by(desc(ScoreSpec.weight))
.limit(multigrams_limit)
.all()
)
stop_ngrams = (session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == list_stop_id)
.all()
)
grouped_ngrams = (session.query(NodeNgramNgram.ngramy_id)
.filter(NodeNgramNgram.node_id == list_group_id)
.all()
print("MAPLIST: top_monograms =", len(top_monograms))
print("MAPLIST: top_multigrams = ", len(top_multigrams))
# NEW MAPLIST NODE
# -----------------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus': corpus.id,
'limit' : limit,
'monograms_part' : monograms_part
}
if overwrite_id:
# overwrite pre-existing node
the_maplist = cache.Node[overwrite_id]
the_maplist.hyperdata = new_hyperdata
the_maplist.save_hyperdata()
session.commit()
the_id = overwrite_id
else:
# create a new maplist node
the_maplist = corpus.add_child(
name="Maplist (in %i)" % corpus.id,
typename="MAPLIST",
hyperdata = new_hyperdata
)
list_map_id = session.query(Node.id).filter(
Node.parent_id==corpus_id,
Node.typename == "MAPLIST"
).first()
if list_map_id == None:
corpus = cache.Node[corpus_id]
user_id = corpus.user_id
list_map = Node(name="MAPLIST", parent_id=corpus_id, user_id=user_id, typename="MAPLIST")
session.add(list_map)
session.add(the_maplist)
session.commit()
list_map_id = list_map.id
session.query(NodeNgram).filter(NodeNgram.node_id==list_map_id).delete()
session.commit()
data = zip(
[list_map_id for i in range(1,limit)]
, [n[0] for n in list(top_multigrams) + list(top_monograms)
if (n[0],) not in list(stop_ngrams)
]
, [1 for i in range(1,limit)]
)
#print([d for d in data])
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
dbg.show('MapList computed')
the_id = the_maplist.id
# create UnweightedList object and save (=> new NodeNgram rows)
datalist = UnweightedList(
[res.ngram_id for res in top_monograms + top_multigrams]
)
# save
datalist.save(the_id)
# dbg.show('MapList computed')
return the_id
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix
"""
Creates a filtering list for corpus ngrams.
(implementation: regexp + "master" stoplist)
"""
from gargantext.models import User, Node, Ngram, NodeNgram
from gargantext.util.db import session, func
from gargantext.constants import LISTTYPES
from re import compile
from sqlalchemy import desc
import re
from sqlalchemy import desc, asc
#from ngram.tools import insert_ngrams
def isStopWord(ngram, stop_words=None):
def is_stop_word(ngram, stop_words=None):
'''
ngram :: (Int, String) => (ngram_id, ngram_terms)
stop_words :: Set of String
(to avoid SQL query each time isStopWord is invoked, get in as parameter)
(pass it in as a parameter to avoid an SQL query each time is_stop_word is invoked)
'''
word = ngram[1]
......@@ -41,7 +39,7 @@ def isStopWord(ngram, stop_words=None):
, "(.*)(travers)(.*)"
, "(.*)(:|\|)(.*)"
] :
compiled_regexes.append(re.compile(regex))
compiled_regexes.append(compile(regex))
for format_regex in compiled_regexes:
if format_regex.match(word):
......@@ -61,32 +59,27 @@ def create_gargantua_resources():
session.add(stopList)
session.commit()
def compute_stop(corpus, stopList_id=None, debug=False):
def do_stoplist(corpus, overwrite_id=None):
'''
Create list of stop words.
TODO: add a function to get all stop words with social scores
Parameters:
- overwrite_id: optional id of a preexisting STOPLIST node to overwrite
'''
# Get the StopList if it exist or create a new one
# Get preexisting StopList if provided in overwrite_id param
if overwrite_id:
stoplist_id = overwrite_id
# At this step of development, a new StopList should be created
if stopList_id == None:
stopList_id = session.query(Node.id).filter(
Node.parent_id==corpus.id,
Node.typename == "STOPLIST"
).first()
if stopList_id == None:
stopList = Node(name="STOPLIST",
parent_id=corpus.id,
user_id=corpus.user_id,
typename="STOPLIST")
session.add(stopList)
session.commit()
stopList_id = stopList.id
# For tests only
if debug == True:
session.query(Node).filter(Node.id==stopList_id).delete()
else:
stoplist = corpus.add_child(
name="Stoplist (in:%s)" % corpus.id,
typename="STOPLIST"
)
session.add(stoplist)
session.commit()
stoplist_id = stoplist.id
# Get common resources, all common StopWords on the platform
## First get the id of the StopList of Gargantua super user
......@@ -107,23 +100,23 @@ def compute_stop(corpus, stopList_id=None, debug=False):
## Get the ngrams
## ngrams :: [(Int, String, Int)]
frequency = func.count( NodeNgram.weight )
ngrams = (session.query( Ngram.id, Ngram.terms, frequency )
ngrams = (session.query( Ngram.id, Ngram.terms)
.join( NodeNgram, NodeNgram.ngram_id == Ngram.id )
.join( Node, Node.id == NodeNgram.node_id )
.filter( Node.parent_id == corpus.id,
Node.typename == "DOCUMENT")
.group_by( Ngram.id )
.order_by( desc( frequency ) )
#.limit(limit)
.all()
)
ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), ngrams)
ngrams_to_stop = filter(
lambda x: is_stop_word(x,stop_words=stop_words), ngrams
)
# print([n for n in ngrams_to_stop])
stop = LISTTYPES["STOPLIST"]({ n[0] : -1 for n in ngrams_to_stop})
# stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])
stop.save(stopList_id)
return stopList_id
stop.save(stoplist_id)
return stoplist_id
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.constants import DEFAULT_COOC_THRESHOLD
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD
def compute_coocs(corpus,
threshold = DEFAULT_COOC_THRESHOLD,
weighted = False,
our_id = None,
stop_id = None,
overwrite_id = None,
threshold = DEFAULT_COOC_THRESHOLD,
mainlist_id = None,
stoplist_id = None,
symmetry_filter = True):
"""
Count how often some extracted terms appear
together in a small context (document)
throughout a larger context (corpus).
node_id | ngram_id | weight ngram1_id | ngram2_id | ucooc | wcooc |
--------+----------+-------- ----------+-----------+-------+-------+
MYDOC | 487 | 1 => 487 | 294 | 1 | 4 |
MYDOC | 294 | 3
[NodeNgram] [NodeNgramNgram]
node_id | ngram_id | weight ngram1_id | ngram2_id | score |
--------+----------+-------- ----------+-----------+-------+
MYDOCA | 487 | 1 => 487 | 294 | 2 |
MYDOCA | 294 | 3
MYDOCB | 487 | 1
MYDOCB | 294 | 4
Fill that info in DB:
- a *new* COOCCURRENCES node
......@@ -25,14 +30,16 @@ def compute_coocs(corpus,
worst-case complexity ~ O(N²/2) with N = number of ngrams
If a mainlist is provided, we filter doc ngrams to those also in the list.
Parameters:
- threshold: on output ucooc count (previously called hapax)
- weighted: if False normal cooc to be saved as result
if True weighted cooc (experimental)
- stop_id: stoplist for filtering input ngrams
- TODO cvalue_id: allow a metric as input filter
- TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
- TODO start, end : filter on document date
- the corpus node
- overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
(all hyperdata and previous NodeNgramNgram rows will be replaced)
- threshold: on output cooc count (previously called hapax)
- mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided)
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
......@@ -54,14 +61,17 @@ def compute_coocs(corpus,
coocs for each doc :
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (x1.ngram_id, x2.ngram_id)
- normally we can count unique appearances of the pair (ucooc)
- we can count sum of sum of weights in the pair (wcooc or cofreq)
- we count unique appearances of the pair (cooc)
TODO
====
use WeightedMatrix
"""
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO start, end : filter on document date
# - TODO weighted: if False normal cooc to be saved as result
# if True weighted cooc (experimental)
# /!\ big combinatorial complexity /!\
# for 8,439 rows in the nodes_ngrams index (1,442 of which have occ > 1),
# the simple cooc query yields 1,859,408 rows
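# A rough pure-python sketch of the counting idea (hypothetical names; the
# real work is done by the NodeNgram self-join built below):
#   from itertools import combinations
#   cooc = defaultdict(int)
#   for doc_ngram_ids in ngrams_by_document:
#       for a, b in combinations(sorted(doc_ngram_ids), 2):
#           cooc[(a, b)] += 1        # one count per co-presence in a document
#   kept = {pair: n for pair, n in cooc.items() if n >= threshold}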
......@@ -94,10 +104,22 @@ def compute_coocs(corpus,
# 2) INPUT FILTERS (reduce N before O(N²))
# £TODO add possibility to restrict to the mainlist
if stop_id:
if mainlist_id:
main_subquery = (
session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == mainlist_id)
.subquery()
)
coocs_query = ( coocs_query
.filter( x1.ngram_id.in_(main_subquery) )
.filter( x2.ngram_id.in_(main_subquery) )
)
if stoplist_id:
stop_subquery = (
session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == stop_id)
.filter(NodeNgram.node_id == stoplist_id)
.subquery()
)
......@@ -128,30 +150,36 @@ def compute_coocs(corpus,
# 3) OUTPUT FILTERS
# ------------------
# threshold
#
coocs_query = coocs_query.having(ucooc > threshold)
coocs_query = coocs_query.having(ucooc >= threshold)
# 4) EXECUTE QUERY
# ----------------
# => storage in our matrix structure
matrix = WeightedMatrix(coocs_query.all())
# fyi
# shape_0 = len({pair[0] for pair in matrix.items})
# shape_1 = len({pair[1] for pair in matrix.items})
# print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
# 5) SAVE
# --------
if our_id:
# use pre-existing id
the_id = our_id
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus': corpus.id,
'threshold': threshold }
if overwrite_id:
# overwrite pre-existing id
the_cooc = cache.Node[overwrite_id]
the_cooc.hyperdata = new_hyperdata
the_cooc.save_hyperdata()
session.commit()
the_id = overwrite_id
else:
# create the new cooc node
the_cooc = Node(
the_cooc = corpus.add_child(
typename = "COOCCURRENCES",
name = "Coocs (in:%s)" % corpus.name[0:10],
parent_id = corpus.id,
user_id = corpus.user_id,
# saving the parameters of the analysis in the Node JSON
hyperdata = { 'corpus': corpus.id,
'threshold': threshold }
hyperdata = new_hyperdata,
)
session.add(the_cooc)
session.commit()
......
from gargantext.models import Node, NodeNgramNgram
from gargantext.util.db import session
from gargantext.util.lists import Translations
"""
For initial ngram groups via stemming
Example:
- groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
- groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
"""
from gargantext.models import Node, NodeNgramNgram
from gargantext.util.db import session
from gargantext.util.lists import Translations
# to convert fr => french :/
from gargantext.util.languages import languages
from nltk.stem.snowball import SnowballStemmer
from re import split as resplit
from collections import defaultdict, Counter
from re import split as resplit
from collections import defaultdict, Counter
from nltk.stem.snowball import SnowballStemmer
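# A minimal illustration of the stem-based keying (hypothetical, standalone):
#   stemmer = SnowballStemmer('english')
#   key = ' '.join(stemmer.stem(w) for w in 'copper engraving'.split())
#   # => 'copper engrav', the same key that 'coppers engraver' maps to,
#   #    so both surface forms land in the same group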
def prepare_stemmers(corpus):
"""
......@@ -22,7 +29,7 @@ def prepare_stemmers(corpus):
stemmers_by_lg[lgiso2] = SnowballStemmer(lgname)
return stemmers_by_lg
def compute_groups(corpus, stoplist_id = None):
def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
"""
1) Use a stemmer/lemmatizer to group forms if they have the same stem/lemma
2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
......@@ -98,17 +105,21 @@ def compute_groups(corpus, stoplist_id = None):
del my_groups
# 2) Create the list node
the_group = Node()
the_group.typename = "GROUPLIST"
the_group.name = "Group (src:%s)" % corpus.name[0:10]
the_group.parent_id = corpus.id # could use corpus.parent_id if free list
the_group.user_id = corpus.user_id
# and save the node
session.add(the_group)
session.commit()
the_id = the_group.id
# 2) the list node
if overwrite_id:
# overwrite pre-existing id
the_id = overwrite_id
# or create the new id
else:
the_group = corpus.add_child(
typename = "GROUPLIST",
name = "Group (src:%s)" % corpus.name[0:10]
)
# and save the node
session.add(the_group)
session.commit()
the_id = the_group.id
# 3) Save each grouping couple to DB thanks to Translations.save() table
ndngng_list = Translations(
......
"""
Computes ngram scores with 3 ranking functions:
- the simple sum of occurrences inside the corpus
- the tfidf inside the corpus
- the global tfidf for all corpora having the same source
FIXME: "having the same source" means we need to select inside hyperdata
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
"""
from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session, bulk_insert
from sqlalchemy import text
from gargantext.util.db import session, bulk_insert, func # = sqlalchemy.func like sum() or count()
from sqlalchemy import text # for query from raw SQL statement
from math import log
# £TODO
# from gargantext.util.lists import WeightedContextIndex
from gargantext.util.db import func # = sqlalchemy.func like sum() or count()
from math import log
def compute_occurrences_local(corpus):
def compute_occurrences_local(corpus, overwrite_id = None):
"""
Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
? optimize ? OCCS here could be calculated simultaneously within TFIDF-CORPUS loop
Parameters:
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
# 1) all the doc_ids of our corpus (scope of counts for filter)
......@@ -37,32 +52,41 @@ def compute_occurrences_local(corpus):
# ^^^^ ^^^
# ngram_id sum_wei
# create the new OCCURRENCES node
occnode = Node()
occnode.typename = "OCCURRENCES"
occnode.name = "occ_sums (in:%s)" % corpus.id
occnode.parent_id = corpus.id
occnode.user_id = corpus.user_id
session.add(occnode)
session.commit()
if overwrite_id:
# overwrite pre-existing id
the_id = overwrite_id
# occnode = cache.Node[overwrite_id]
else:
# create the new OCCURRENCES node
occnode = corpus.add_child(
typename = "OCCURRENCES",
name = "occ_sums (in:%s)" % corpus.id
)
session.add(occnode)
session.commit()
the_id = occnode.id
# reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert(
NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'),
((occnode.id, corpus.id, res[0], res[1]) for res in occ_sums)
((the_id, corpus.id, res[0], res[1]) for res in occ_sums)
)
return occnode.id
return the_id
def compute_tfidf(corpus, scope="local"):
def compute_tfidf(corpus, scope="local", overwrite_id=None):
"""
Calculates tfidf within the current corpus
Parameter:
Parameters:
- the corpus itself
- scope: {"local" or "global"}
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
# local <=> within this corpus
......@@ -121,23 +145,27 @@ def compute_tfidf(corpus, scope="local"):
tfidfs[ngram_id] = tf * (log_tot_docs-log(nd))
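# i.e. tf * log(total_docs / nd): the classic tf-idf weighting, where nd is
# (presumably, from the variable name) the number of docs in the chosen scope
# that contain the ngram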
# -------------------------------------------------
# create the new TFIDF-CORPUS node
tfidf_nd = Node(parent_id = corpus.id, user_id = corpus.user_id)
if scope == "local":
tfidf_nd.typename = "TFIDF-CORPUS"
tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id
elif scope == "global":
tfidf_nd.typename = "TFIDF-GLOBAL"
tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type
session.add(tfidf_nd)
session.commit()
if overwrite_id:
the_id = overwrite_id
else:
# create the new TFIDF-XXXX node
tfidf_nd = corpus.add_child()
if scope == "local":
tfidf_nd.typename = "TFIDF-CORPUS"
tfidf_nd.name = "tfidf-c (in:%s)" % corpus.id
elif scope == "global":
tfidf_nd.typename = "TFIDF-GLOBAL"
tfidf_nd.name = "tfidf-g (in type:%s)" % this_source_type
session.add(tfidf_nd)
session.commit()
the_id = tfidf_nd.id
# reflect that in NodeNodeNgrams
# £TODO replace bulk_insert by something like WeightedContextMatrix.save()
bulk_insert(
NodeNodeNgram,
('node1_id' , 'node2_id', 'ngram_id', 'score'),
((tfidf_nd.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
('node1_id', 'node2_id','ngram_id', 'score'),
((the_id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
)
return tfidf_nd.id
return the_id
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import *
from gargantext.constants import *
# from gargantext.util.analysis.cooccurrences import do_cooc
from gargantext.models import Node, Ngram, NodeNgramNgram, NodeNodeNgram
import pandas as pd
from collections import defaultdict
def compute_specificity(corpus, cooc_id, limit=100):
"""
Computes a specificity metric from the ngram cooccurrence matrix.
+ SAVE => WeightedList => NodeNgram
"""
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from gargantext.util.db import session, aliased, func, bulk_insert
from gargantext.util.lists import WeightedList
from collections import defaultdict
from pandas import DataFrame
def compute_specificity(corpus, cooc_id=None, overwrite_id = None):
'''
Compute the specificity (a simple calculation).
Parameters:
- cooc_id: mandatory id of a cooccurrences node to use as base
- overwrite_id: optional preexisting specificity node to overwrite
'''
cooccurrences = (session.query(NodeNgramNgram)
.filter(NodeNgramNgram.node_id==cooc_id)
# no filtering: new choice filter on tfidf before creation
# .order_by(NodeNgramNgram.weight)
# .limit(limit)
)
# no filtering here: by design the cooc matrix is already filtered on tfidf at creation
matrix = defaultdict(lambda : defaultdict(float))
......@@ -30,7 +31,9 @@ def compute_specificity(corpus, cooc_id, limit=100):
nb_ngrams = len(matrix)
d = pd.DataFrame(matrix).fillna(0)
print("SPECIFICITY: computing on %i ngrams" % nb_ngrams)
d = DataFrame(matrix).fillna(0)
# proba p(x|y) (i.e. each column is divided by its total)
d = d / d.sum(axis=0)
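# after this step each column y of d is a probability distribution p(x|y);
# tiny sketch of the same normalisation on a 2x2 matrix:
#   DataFrame([[1, 2], [3, 2]]) / DataFrame([[1, 2], [3, 2]]).sum(axis=0)
#   # => columns become [0.25, 0.75] and [0.5, 0.5]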
......@@ -74,28 +77,27 @@ def compute_specificity(corpus, cooc_id, limit=100):
# ----------------
# specificity node
node = session.query(Node).filter(
Node.parent_id==corpus.id,
Node.typename == "SPECIFICITY"
).first()
if node == None:
user_id = corpus.user_id
node = Node(name="Specif (in:%i)" % corpus.id,
parent_id=corpus.id,
user_id=user_id,
typename="SPECIFICITY")
session.add(node)
if overwrite_id:
# overwrite pre-existing id
the_id = overwrite_id
session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id==the_id).delete()
session.commit()
else:
specnode = corpus.add_child(
typename = "SPECIFICITY",
name = "Specif (in:%s)" % corpus.id
)
session.add(specnode)
session.commit()
the_id = specnode.id
data = zip( [node.id] * nb_ngrams
, [corpus.id] * nb_ngrams
, v.index.tolist()
, v.values.tolist()
)
session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id==node.id).delete()
session.commit()
# print(v)
bulk_insert(NodeNodeNgram, ['node1_id', 'node2_id', 'ngram_id', 'score'], [d for d in data])
data = WeightedList(
zip( v.index.tolist()
, v.values.tolist()
)
)
data.save(the_id)
return(node.id)
return(the_id)
......@@ -94,6 +94,7 @@ def project(request, project_id):
)
session.add(corpus)
session.commit()
# parse_extract: fileparsing -> ngram extraction -> lists
scheduled(parse_extract)(corpus.id)
# corpora within this project
......@@ -101,16 +102,26 @@ def project(request, project_id):
sourcename2corpora = defaultdict(list)
for corpus in corpora:
# we only consider the first resource of the corpus to determine its type
resource = corpus.resources()[0]
resource_type_name = RESOURCETYPES[resource['type']]['name']
resources = corpus.resources()
if len(resources):
resource = resources[0]
resource_type_name = RESOURCETYPES[resource['type']]['name']
else:
print("(WARNING) PROJECT view: no listed resource")
# add some data for the viewer
corpus.count = corpus.children('DOCUMENT').count()
status = corpus.status()
if status is not None and not status['complete']:
corpus.status_message = '(in progress: %s, %d complete)' % (
status['action'].replace('_', ' '),
status['progress'],
)
if not status['error']:
corpus.status_message = '(in progress: %s, %d complete)' % (
status['action'].replace('_', ' '),
status['progress'],
)
else:
corpus.status_message = '(aborted: "%s" after %i docs)' % (
status['error'][-1],
status['progress']
)
else:
corpus.status_message = ''
# add
......