Commit 6260e8c1 authored by Romain Loth

maplist generation and better estimates for constants (thresholds)

parent 57517450
@@ -9,9 +9,11 @@ LISTTYPES = {
'STOPLIST' : UnweightedList,
'MAINLIST' : UnweightedList,
'MAPLIST' : UnweightedList,
'SPECIFICITY' : WeightedList,
'OCCURRENCES' : WeightedContextIndex,
'COOCCURRENCES': WeightedMatrix,
'TFIDF-CORPUS' : WeightedContextIndex,
'TFIDF-GLOBAL' : WeightedContextIndex,
}
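# usage sketch (cf. do_stoplist in list_stop.py): pick the container class
# for a list typename, fill it, then save (=> NodeNgram rows), e.g.:
#   stop = LISTTYPES["STOPLIST"]({ngram_id: -1 for ngram_id in (1, 2, 3)})
#   stop.save(stoplist_node_id)   # stoplist_node_id: some existing Node id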
NODETYPES = [
@@ -92,10 +94,21 @@ RESOURCETYPES = [
# },
]
# linguistic extraction parameters
DEFAULT_TFIDF_CUTOFF_RATIO = .55 # for MAINLIST maximum terms
DEFAULT_TFIDF_HARD_LIMIT = 1000 # for MAINLIST maximum terms
DEFAULT_COOC_THRESHOLD = 4 # for COOCCURRENCES node
# linguistic extraction parameters ---------------------------------------------
DEFAULT_TFIDF_CUTOFF_RATIO = .45 # MAINLIST maximum terms in %
DEFAULT_TFIDF_HARD_LIMIT = 750 # MAINLIST maximum terms abs
# (makes COOCS larger ~ O(N²) /!\)
DEFAULT_COOC_THRESHOLD = 5 # inclusive minimum for COOCS coefs
# (makes COOCS more sparse)
DEFAULT_MAPLIST_MAX = 300 # MAPLIST maximum terms
DEFAULT_MAPLIST_MONOGRAMS_RATIO = .5 # part of monograms in MAPLIST
# (NB: used to be 0.005 !!)
# ------------------------------------------------------------------------------
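# minimal sketch of how the two MAINLIST limits combine (cf. do_mainlist):
#   kept = min(DEFAULT_TFIDF_HARD_LIMIT,
#              round(nb_ngrams * DEFAULT_TFIDF_CUTOFF_RATIO))
#   e.g. nb_ngrams = 1000  => kept = min(750, 450)  = 450  (ratio wins)
#        nb_ngrams = 10000 => kept = min(750, 4500) = 750  (hard limit wins)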
# other parameters
# default number of docs POSTed to scrappers.views.py
......
from .parsing import parse
from .ngrams_extraction import extract_ngrams
# in usual run order
from .list_stop import do_stoplist
from .ngram_scores import compute_occurrences_local, compute_tfidf
from .list_main import do_mainlist
from .ngram_coocs_tempo import compute_coocs
from .score_specificity import compute_specificity
from .list_map import compute_mapList # TEST
from .list_map import do_maplist # TEST
from .ngram_groups import compute_groups
from gargantext.util.db import session
@@ -40,10 +41,19 @@ def parse_extract(corpus):
# -------------------------------
print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))
# -> stoplist: compute + write (=> Node and NodeNgram)
stop_id = compute_stop(corpus)
# -> stoplist: filter + write (to Node and NodeNgram)
stop_id = do_stoplist(corpus)
print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))
# -> write groups to Node and NodeNgramNgram
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occurrences_local(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# ------------
# -> write local tfidf to Node and NodeNodeNgram
ltfidf_id = compute_tfidf(corpus, scope="local")
print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(), ltfidf_id))
@@ -52,31 +62,27 @@ def parse_extract(corpus):
gtfidf_id = compute_tfidf(corpus, scope="global")
print('CORPUS #%d: [%s] new globaltfidf node #%i' % (corpus.id, t(), gtfidf_id))
# -> mainlist: compute + write (to Node and NodeNgram)
mainlist_id = mainlist_filter(corpus, tfidf_id = gtfidf_id, stoplist_id = stop_id)
# -> mainlist: filter + write (to Node and NodeNgram)
mainlist_id = do_mainlist(corpus,
tfidf_id = gtfidf_id,
stoplist_id = stop_id)
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id, stop_id = None)
print('CORPUS #%d: [%s] new cooccs node #%i' % (corpus.id, t(), cooc_id))
cooc_id = compute_coocs(corpus, mainlist_id = mainlist_id)
print('CORPUS #%d: [%s] new coocs node #%i' % (corpus.id, t(), cooc_id))
# ?? specificity: compute + write (=> NodeNodeNgram)
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id = compute_specificity(corpus, cooc_id=cooc_id)
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), cooc_id))
print('CORPUS #%d: [%s] new specificity node #%i' % (corpus.id, t(), spec_id))
# ?? maplist: compute + write (to Node and NodeNgram)
# map_id = compute_stop(corpus)
# print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id = compute_occurrences_local(corpus)
print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))
# -> write groups to Node and NodeNgramNgram
group_id = compute_groups(corpus, stoplist_id = None)
print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(), group_id))
map_id = do_maplist(corpus,
mainlist_id = mainlist_id,
specificity_id=spec_id,
grouplist_id=group_id)
print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))
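# condensed sketch of the run order wired above (same calls, prints omitted):
#   stop_id     = do_stoplist(corpus)
#   group_id    = compute_groups(corpus, stoplist_id=None)
#   occ_id      = compute_occurrences_local(corpus)
#   ltfidf_id   = compute_tfidf(corpus, scope="local")
#   gtfidf_id   = compute_tfidf(corpus, scope="global")
#   mainlist_id = do_mainlist(corpus, tfidf_id=gtfidf_id, stoplist_id=stop_id)
#   cooc_id     = compute_coocs(corpus, mainlist_id=mainlist_id)
#   spec_id     = compute_specificity(corpus, cooc_id=cooc_id)
#   map_id      = do_maplist(corpus, mainlist_id=mainlist_id,
#                            specificity_id=spec_id, grouplist_id=group_id)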
def t():
......
@@ -2,26 +2,38 @@ from gargantext.models import Node, NodeNgram, NodeNodeNgram
from gargantext.util.db import session
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, DEFAULT_TFIDF_HARD_LIMIT
from math import floor
from gargantext.constants import DEFAULT_TFIDF_CUTOFF_RATIO, \
DEFAULT_TFIDF_HARD_LIMIT
def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
def do_mainlist(corpus,
overwrite_id = None,
tfidf_id=None, stoplist_id=None,
hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO
):
"""
Select terms for the mainlist according to a global tfidf and stoplist.
Select top n terms according to a global tfidf ranking and stoplist filter.
The number of selected terms will be:
min(hard_limit, number_of_terms * ratio_limit)
NB: we use a global tfidf node where the values are global but the ngrams
are already restricted to those occurring in this corpus's documents.
TO DISCUSS: allow influence of the local tfidf scores too
Parameters:
2 limits are useful to set a maximum amount of picked terms
- ratio_limit: relative to the number of distinct ngrams [0,1]
- hard_limit: absolute value [default: 1000]
- the corpus itself
- a tfidf score for ranking the ngrams
- a stoplist for filtering some ngrams
- overwrite_id: optional id of a pre-existing MAINLIST node for this corpus
(the Node and its previous NodeNgram rows will be replaced)
+ 2 limits to set the amount of picked terms:
- ratio_limit ∈ [0,1]: a ratio relative to the number of distinct ngrams
(default: 0.45, i.e. DEFAULT_TFIDF_CUTOFF_RATIO)
- hard_limit: an absolute max value
(default: 750, i.e. DEFAULT_TFIDF_HARD_LIMIT)
"""
# retrieve helper nodes if not provided
@@ -61,20 +73,26 @@ def do_mainlist(corpus, tfidf_id=None, stoplist_id=None,
nb_ngrams = ordered_filtered_tfidf.count()
# apply ratio to find smallest limit
our_limit = min(hard_limit, floor(nb_ngrams * ratio_limit))
our_limit = min(hard_limit, round(nb_ngrams * ratio_limit))
print("MAINLIST: keeping %i ngrams out of %i" % (our_limit,nb_ngrams))
# DB retrieve up to limit => MAINLIST
top_ngrams_ids = ordered_filtered_tfidf.limit(our_limit).all()
# now create the new MAINLIST node
mainlist = corpus.add_child(
typename = "MAINLIST",
name = "Mainlist (in:%s)" % corpus.name[0:10]
)
session.add(mainlist)
session.commit()
the_id = mainlist.id
if overwrite_id:
# overwrite pre-existing id
the_id = overwrite_id
# mainlist = cache.Node[overwrite_id]
else:
# now create the new MAINLIST node
mainlist = corpus.add_child(
typename = "MAINLIST",
name = "Mainlist (in:%s)" % corpus.id
)
session.add(mainlist)
session.commit()
the_id = mainlist.id
# create UnweightedList object and save (=> new NodeNgram rows)
UnweightedList(top_ngrams_ids).save(the_id)
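# hedged usage sketch: a later re-run can refresh the same MAINLIST node
# in place via overwrite_id (previous_mainlist_id is hypothetical):
#   do_mainlist(corpus, tfidf_id=gtfidf_id, stoplist_id=stop_id,
#               overwrite_id=previous_mainlist_id)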
......
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
"""
Selects a subset of corpus ngrams to use in the graph map.
"""
from gargantext.models.ngrams import Ngram, NodeNgram,\
NodeNodeNgram, NodeNgramNgram
from gargantext.models.ngrams import Node, Ngram, NodeNgram, \
NodeNgramNgram, NodeNodeNgram
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.util.lists import UnweightedList
from sqlalchemy import desc
from gargantext.constants import DEFAULT_MAPLIST_MAX,\
DEFAULT_MAPLIST_MONOGRAMS_RATIO
def do_maplist(corpus,
overwrite_id = None,
mainlist_id = None,
specificity_id = None,
grouplist_id = None,
limit=DEFAULT_MAPLIST_MAX,
monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO
):
'''
Selects the map terms according to specificity scores and the mainlist.
from sqlalchemy.sql import func
from sqlalchemy import desc, asc, or_, and_, Date, cast, select
from sqlalchemy import literal_column
from sqlalchemy.orm import aliased
from gargantext.util.toolchain.ngram_tools import insert_ngrams
import csv
Parameters:
- mainlist_id (starting point, already cleaned of stoplist terms)
- specificity_id (ranking factor)
- grouplist_id (filtering grouped ones)
- overwrite_id: optional id of a preexisting MAPLIST node to overwrite
def compute_mapList(corpus_id,limit=500,n=1, session=None):
'''
According to Specificities and stoplist,
+ 2 constants to modulate the choice of terms:
- limit: the maximum amount of picked terms
- monograms_part: the share of single-word terms among them
'''
monograms_part = 0.005
if not (mainlist_id and specificity_id and grouplist_id):
raise ValueError("Please provide mainlist_id, specificity_id and grouplist_id")
monograms_limit = round(limit * monograms_part)
multigrams_limit = limit - monograms_limit
print("MAPLIST: monograms_limit =", monograms_limit)
print("MAPLIST: multigrams_limit = ", multigrams_limit)
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
list_main_id = session.query(Node.id).filter(
Node.typename == "MAINLIST",
Node.parent_id == corpus_id).first()
list_stop_id = session.query(Node.id).filter(
Node.typename == "STOPLIST",
Node.parent_id == corpus_id).first()
list_group_id = session.query(Node.id).filter(
Node.typename == "GROUPLIST",
Node.parent_id == corpus_id).first()
score_spec_id = session.query(Node.id).filter(
Node.typename == "SPECIFICITY",
Node.parent_id == corpus_id).first()
ListMain=aliased(NodeNgram)
ListStop=aliased(NodeNgram)
ListGroup=aliased(NodeNgramNgram)
ScoreSpec=aliased(NodeNodeNgram)
# FIXME outerjoin does not work with current SqlAlchemy
# lines below the query do the job but it can be improved
query = (session.query(ScoreSpec.ngram_id, ScoreSpec.score)
.join(ListMain, ScoreSpec.ngram_id == ListMain.ngram_id)
mainterms_subquery = (session
# we want only terms within mainlist
.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == mainlist_id)
.subquery()
)
primary_groupterms_subquery = (session
# we want only primary terms (ngram1)
.query(NodeNgramNgram.ngram1_id)
.filter(NodeNgramNgram.node_id == grouplist_id)
.subquery()
)
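# NB (assumption, cf. ngram_groups.py): a GROUPLIST row maps a primary form
# (ngram1_id) to one of its grouped variants (ngram2_id), so keeping only
# ngram1_id selects one representative term per group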
ScoreSpec=aliased(NodeNgram)
# specificity-ranked
query = (session.query(ScoreSpec.ngram_id)
.join(Ngram, Ngram.id == ScoreSpec.ngram_id)
#.outerjoin(ListGroup, Group.ngramy_id == ScoreSpec.ngram_id)
#.outerjoin(ListStop, Stop.ngram_id == ScoreSpec.ngram_id)
.filter(ListMain.node_id == list_main_id)
#.filter(ListGroup.node_id == list_group_id)
#.filter(ListStop.node_id == list_stop_id)
.filter(ScoreSpec.nodex_id == score_spec_id)
.filter(ScoreSpec.node_id == specificity_id)
.filter(ScoreSpec.ngram_id.in_(mainterms_subquery))
.filter(ScoreSpec.ngram_id.in_(primary_groupterms_subquery))
)
top_monograms = (query
.filter(Ngram.n == 1)
.order_by(desc(ScoreSpec.score))
.order_by(desc(ScoreSpec.weight))
.limit(monograms_limit)
.all()
)
top_multigrams = (query
.filter(Ngram.n >= 2)
.order_by(desc(ScoreSpec.score))
.order_by(desc(ScoreSpec.weight))
.limit(multigrams_limit)
.all()
)
stop_ngrams = (session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == list_stop_id)
.all()
)
grouped_ngrams = (session.query(NodeNgramNgram.ngramy_id)
.filter(NodeNgramNgram.node_id == list_group_id)
.all()
print("MAPLIST: top_monograms =", len(top_monograms))
print("MAPLIST: top_multigrams = ", len(top_multigrams))
# NEW MAPLIST NODE
# -----------------
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus': corpus.id,
'limit' : limit,
'monograms_part' : monograms_part
}
if overwrite_id:
# overwrite pre-existing node
the_maplist = cache.Node[overwrite_id]
the_maplist.hyperdata = new_hyperdata
the_maplist.save_hyperdata()
session.commit()
the_id = overwrite_id
else:
# create a new maplist node
the_maplist = corpus.add_child(
name="Maplist (in %i)" % corpus.id,
typename="MAPLIST",
hyperdata = new_hyperdata
)
list_map_id = session.query(Node.id).filter(
Node.parent_id==corpus_id,
Node.typename == "MAPLIST"
).first()
if list_map_id == None:
corpus = cache.Node[corpus_id]
user_id = corpus.user_id
list_map = Node(name="MAPLIST", parent_id=corpus_id, user_id=user_id, typename="MAPLIST")
session.add(list_map)
session.add(the_maplist)
session.commit()
list_map_id = list_map.id
session.query(NodeNgram).filter(NodeNgram.node_id==list_map_id).delete()
session.commit()
data = zip(
[list_map_id for i in range(1,limit)]
, [n[0] for n in list(top_multigrams) + list(top_monograms)
if (n[0],) not in list(stop_ngrams)
]
, [1 for i in range(1,limit)]
)
#print([d for d in data])
bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
dbg.show('MapList computed')
the_id = the_maplist.id
# create UnweightedList object and save (=> new NodeNgram rows)
datalist = UnweightedList(
[res.ngram_id for res in top_monograms + top_multigrams]
)
# save
datalist.save(the_id)
# dbg.show('MapList computed')
return the_id
from gargantext.util.db import *
from gargantext.util.db_cache import *
from gargantext.constants import *
from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix
"""
Creates a filtering list for corpus ngrams.
(implementation: regexp + "master" stoplist)
"""
from gargantext.models import User, Node, Ngram, NodeNgram
from gargantext.util.db import session, func
from gargantext.constants import LISTTYPES
from re import compile
from sqlalchemy import desc
import re
from sqlalchemy import desc, asc
#from ngram.tools import insert_ngrams
def isStopWord(ngram, stop_words=None):
def is_stop_word(ngram, stop_words=None):
'''
ngram :: (Int, String) => (ngram_id, ngram_terms)
stop_words :: Set of String
(to avoid SQL query each time isStopWord is invoked, get in as parameter)
(to avoid an SQL query each time is_stop_word is invoked, pass it in as a parameter)
'''
word = ngram[1]
@@ -41,7 +39,7 @@ def isStopWord(ngram, stop_words=None):
, "(.*)(travers)(.*)"
, "(.*)(:|\|)(.*)"
] :
compiled_regexes.append(re.compile(regex))
compiled_regexes.append(compile(regex))
for format_regex in compiled_regexes:
if format_regex.match(word):
@@ -61,32 +59,27 @@ def create_gargantua_resources():
session.add(stopList)
session.commit()
def compute_stop(corpus, stopList_id=None, debug=False):
def do_stoplist(corpus, overwrite_id=None):
'''
Create a list of stop words.
TODO: write a function to get all stop words with social scores
Parameters:
- overwrite_id: optional preexisting STOPLIST node to overwrite
'''
# Get the StopList if it exist or create a new one
# Get preexisting StopList if provided in overwrite_id param
if overwrite_id:
stoplist_id = overwrite_id
# At this step of development, a new StopList should be created
if stopList_id == None:
stopList_id = session.query(Node.id).filter(
Node.parent_id==corpus.id,
Node.typename == "STOPLIST"
).first()
if stopList_id == None:
stopList = Node(name="STOPLIST",
parent_id=corpus.id,
user_id=corpus.user_id,
typename="STOPLIST")
session.add(stopList)
session.commit()
stopList_id = stopList.id
# For tests only
if debug == True:
session.query(Node).filter(Node.id==stopList_id).delete()
else:
stoplist = corpus.add_child(
name="Stoplist (in:%s)" % corpus.id,
typename="STOPLIST"
)
session.add(stoplist)
session.commit()
stoplist_id = stoplist.id
# Get common resources, all common StopWords on the platform
## First get the id of the StopList of Gargantua super user
@@ -107,23 +100,23 @@ def compute_stop(corpus, stopList_id=None, debug=False):
## Get the ngrams
## ngrams :: [(Int, String, Int)]
frequency = func.count( NodeNgram.weight )
ngrams = (session.query( Ngram.id, Ngram.terms, frequency )
ngrams = (session.query( Ngram.id, Ngram.terms)
.join( NodeNgram, NodeNgram.ngram_id == Ngram.id )
.join( Node, Node.id == NodeNgram.node_id )
.filter( Node.parent_id == corpus.id,
Node.typename == "DOCUMENT")
.group_by( Ngram.id )
.order_by( desc( frequency ) )
#.limit(limit)
.all()
)
ngrams_to_stop = filter(lambda x: isStopWord(x,stop_words=stop_words), ngrams)
ngrams_to_stop = filter(
lambda x: is_stop_word(x,stop_words=stop_words), ngrams
)
# print([n for n in ngrams_to_stop])
stop = LISTTYPES["STOPLIST"]({ n[0] : -1 for n in ngrams_to_stop})
# stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])
stop.save(stopList_id)
return stopList_id
stop.save(stoplist_id)
return stoplist_id
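# hedged usage sketch: a re-run that refreshes the same STOPLIST node
# (previous_stoplist_id is hypothetical):
#   same_id = do_stoplist(corpus, overwrite_id=previous_stoplist_id)
#   assert same_id == previous_stoplist_id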
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.constants import DEFAULT_COOC_THRESHOLD
from gargantext.models import Node, NodeNgram, NodeNgramNgram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD
def compute_coocs(corpus,
threshold = DEFAULT_COOC_THRESHOLD,
weighted = False,
our_id = None,
stop_id = None,
overwrite_id = None,
threshold = DEFAULT_COOC_THRESHOLD,
mainlist_id = None,
stoplist_id = None,
symmetry_filter = True):
"""
Count how often some extracted terms appear
together in a small context (document)
throughout a larger context (corpus).
node_id | ngram_id | weight ngram1_id | ngram2_id | ucooc | wcooc |
--------+----------+-------- ----------+-----------+-------+-------+
MYDOC | 487 | 1 => 487 | 294 | 1 | 4 |
MYDOC | 294 | 3
[NodeNgram] [NodeNgramNgram]
node_id | ngram_id | weight ngram1_id | ngram2_id | score |
--------+----------+-------- ----------+-----------+-------+
MYDOCA | 487 | 1 => 487 | 294 | 2 |
MYDOCA | 294 | 3
MYDOCB | 487 | 1
MYDOCB | 294 | 4
Fill that info in DB:
- a *new* COOCCURRENCES node
@@ -25,14 +30,16 @@ def compute_coocs(corpus,
worst case complexity ~ O(N²/2) with N = number of ngrams
If a mainlist is provided, we filter doc ngrams to those also in the list.
Parameters:
- threshold: on output ucooc count (previously called hapax)
- weighted: if False normal cooc to be saved as result
if True weighted cooc (experimental)
- stop_id: stoplist for filtering input ngrams
- TODO cvalue_id: allow a metric as input filter
- TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
- TODO start, end : filter on document date
- the corpus node
- overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
(all hyperdata and previous NodeNgramNgram rows will be replaced)
- threshold: on output cooc count (previously called hapax)
- mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided)
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
@@ -54,14 +61,17 @@ coocs for each doc :
coocs for each doc :
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (x1.ngram_id, x2.ngram_id)
- normally we can count unique appearances of the pair (ucooc)
- we can count sum of sum of weights in the pair (wcooc or cofreq)
- we count unique appearances of the pair (cooc)
TODO
====
use WeightedMatrix
"""
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO start, end : filter on document date
# - TODO weighted: if False normal cooc to be saved as result
# if True weighted cooc (experimental)
# /!\ big combinatorial complexity /!\
# for 8,439 rows in the nodes_ngrams index (1,442 of them with occ > 1),
# the simple cooc query yields 1,859,408 rows
@@ -94,10 +104,22 @@
# 2) INPUT FILTERS (reduce N before O(N²))
# £TODO add possibility to restrict to the mainlist
if stop_id:
if mainlist_id:
main_subquery = (
session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == mainlist_id)
.subquery()
)
coocs_query = ( coocs_query
.filter( x1.ngram_id.in_(main_subquery) )
.filter( x2.ngram_id.in_(main_subquery) )
)
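# (NB: both members of each candidate pair must belong to the mainlist)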
if stoplist_id:
stop_subquery = (
session.query(NodeNgram.ngram_id)
.filter(NodeNgram.node_id == stop_id)
.filter(NodeNgram.node_id == stoplist_id)
.subquery()
)
@@ -128,30 +150,36 @@
# 3) OUTPUT FILTERS
# ------------------
# threshold
#
coocs_query = coocs_query.having(ucooc > threshold)
coocs_query = coocs_query.having(ucooc >= threshold)
# 4) EXECUTE QUERY
# ----------------
# => storage in our matrix structure
matrix = WeightedMatrix(coocs_query.all())
# fyi
# shape_0 = len({pair[0] for pair in matrix.items})
# shape_1 = len({pair[1] for pair in matrix.items})
# print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
# 5) SAVE
# --------
if our_id:
# use pre-existing id
the_id = our_id
# saving the parameters of the analysis in the Node JSON
new_hyperdata = { 'corpus': corpus.id,
'threshold': threshold }
if overwrite_id:
# overwrite pre-existing id
the_cooc = cache.Node[overwrite_id]
the_cooc.hyperdata = new_hyperdata
the_cooc.save_hyperdata()
session.commit()
the_id = overwrite_id
else:
# create the new cooc node
the_cooc = Node(
the_cooc = corpus.add_child(
typename = "COOCCURRENCES",
name = "Coocs (in:%s)" % corpus.name[0:10],
parent_id = corpus.id,
user_id = corpus.user_id,
# saving the parameters of the analysis in the Node JSON
hyperdata = { 'corpus': corpus.id,
'threshold': threshold }
hyperdata = new_hyperdata,
)
session.add(the_cooc)
session.commit()
......
from gargantext.models import Node, NodeNgramNgram
from gargantext.util.db import session
from gargantext.util.lists import Translations
"""
For initial ngram groups via stemming
Example:
- groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
- groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
"""
from gargantext.models import Node, NodeNgramNgram
from gargantext.util.db import session
from gargantext.util.lists import Translations
# to convert fr => french :/
from gargantext.util.languages import languages
from nltk.stem.snowball import SnowballStemmer
from re import split as resplit
from collections import defaultdict, Counter
from re import