Commit 1cdf6c86 authored by delanoe

Merge branch 'romain-sqlcoocs' of ssh://delanoe.org:1979/gargantext into testing

parents c1c0462e 5cbe3dc5
"""
COOCS
(this is the full SQL version, should be more reliable on outerjoin)
"""
from gargantext                   import settings
from sqlalchemy                   import create_engine
from gargantext.util.lists        import WeightedMatrix
# from gargantext.util.db         import session, aliased, func
from gargantext.util.db_cache     import cache
from gargantext.constants         import DEFAULT_COOC_THRESHOLD
from gargantext.constants         import INDEXED_HYPERDATA
from gargantext.util.tools        import datetime, convert_to_date


def compute_coocs( corpus,
                   overwrite_id = None,

@@ -58,190 +61,49 @@ def compute_coocs( corpus,
                    this convention: "2001-01-01" aka "%Y-%m-%d")
      - symmetry_filter: prevent calculating where ngram1_id > ngram2_id
      - diagonal_filter: prevent calculating where ngram1_id == ngram2_id

    (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
      - isMonopartite: ?? used a nodes_hyperdata_ngrams table ???

    basic idea for one doc
    ======================
    each pair of ngrams sharing same doc (node_id)
        SELECT idxa.ngram_id, idxb.ngram_id
        FROM nodes_ngrams AS idxa
        ---------------------------------
        JOIN nodes_ngrams AS idxb
        ON idxa.node_id = idxb.node_id      <== that's cooc
        ---------------------------------
        AND idxa.ngram_id <> idxb.ngram_id     (diagonal_filter)
        AND idxa.node_id = MY_DOC ;

    on entire corpus
    ================
    coocs for each doc :
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
      - we count unique appearances of the pair (cooc)
    """
    # 1) prepare direct connection to the DB
    url = 'postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{NAME}'.format(
            **settings.DATABASES['default']
    )
    engine = create_engine( url )
    connection = engine.connect()
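    # NB a sketch, not from this commit: if this runs in a one-shot worker,
    #    a non-pooled engine avoids keeping an idle connection open:
    #      from sqlalchemy.pool import NullPool
    #      engine = create_engine( url, poolclass=NullPool )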
    # string vars for our SQL query
    sql_statement = ""
    doc_idx_statement = ""

    # 2a) prepare the document selection (normal case)
    doc_idx_statement = """
        SELECT node_id, ngram_id
        FROM nodes_ngrams
        JOIN nodes
            ON node_id = nodes.id
        WHERE nodes.parent_id = {corpus_id}
          AND nodes.typename = 4
        """.format(corpus_id=corpus.id)
    # 2b) same if document filters
    if start or end:
        date_type_id = INDEXED_HYPERDATA['publication_date']['id']
        doc_idx_statement = """
            SELECT node_id, ngram_id
            FROM nodes_ngrams
            JOIN nodes
                ON node_id = nodes.id

            -- preparing for date filter (1/2)
            JOIN nodes_hyperdata
                ON nodes_hyperdata.node_id = nodes_ngrams.node_id

            WHERE nodes.parent_id = {corpus_id}
              AND nodes.typename = 4

            -- preparing for date filter (2/2)
              AND nodes_hyperdata.key = {date_type_id}
            """.format(corpus_id=corpus.id, date_type_id=date_type_id)
    if start:
        if not isinstance(start, datetime):
            try:
                start = datetime.strptime(start, '%Y-%m-%d')
            except:
                raise TypeError("'start' param expects datetime object or %Y-%m-%d string")

        # datetime object ~> db-formatted date filter (2013-09-16 00:00:00+02)
        start_filter = "AND nodes_hyperdata.value_utc >= '%s'::date" % start.strftime('%Y-%m-%d %H:%M:%S%z')

        # the filtering by start limit
        doc_idx_statement += "\n" + start_filter
    if end:
        if not isinstance(end, datetime):
            try:
                end = datetime.strptime(end, '%Y-%m-%d')
            except:
                raise TypeError("'end' param expects datetime object or %Y-%m-%d string")

        # datetime object ~> db-formatted date filter
        end_filter = "AND nodes_hyperdata.value_utc <= '%s'::date" % end.strftime('%Y-%m-%d %H:%M:%S%z')

        # the filtering by end limit
        doc_idx_statement += "\n" + end_filter
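    # e.g. a hypothetical start = datetime(2013, 9, 16) appends:
    #   AND nodes_hyperdata.value_utc >= '2013-09-16 00:00:00'::date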
    # 3) prepare the synonyms
    if groupings_id:
        syn_statement = """
            SELECT * FROM nodes_ngrams_ngrams
            WHERE node_id = {groupings_id}
            """.format(groupings_id = groupings_id)
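    # each row of this translation table maps a subform to its mainform
    # (ngram1_id = mainform, ngram2_id = subform): the LEFT JOINs below match
    # on ngram2_id and COALESCE then substitutes ngram1_id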
    # 4a) MAIN DB QUERY SKELETON (no groupings) --------------------------------
    if not groupings_id:
        sql_statement = """
            SELECT cooc.*
            FROM (
                SELECT idxA.ngram_id AS ngA,
                       idxB.ngram_id AS ngB,
                       count((idxA.ngram_id,
                              idxB.ngram_id)) AS cwei

                -- read doc index x 2
                FROM ({doc_idx}) AS idxA
                JOIN ({doc_idx}) AS idxB

                -- cooc <=> in same doc node
                    ON idxA.node_id = idxB.node_id

                GROUP BY ((idxA.ngram_id, idxB.ngram_id))
            ) AS cooc
            """.format(doc_idx = doc_idx_statement)
    # --------------------------------------------------------------------------
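    # with the docstring's toy index, the pair (487, 294) co-occurs in both
    # MyDocA and MyDocB, so the self-join yields two grouped rows and cwei = 2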
    # 4b) MAIN DB QUERY SKELETON (with groupings)
    #     groupings: we use additional Translation (synonyms) for ngA and ngB
    else:
        sql_statement = """
            SELECT cooc.*
            FROM (
                SELECT COALESCE(synA.ngram1_id, idxA.ngram_id) AS ngA,
                       COALESCE(synB.ngram1_id, idxB.ngram_id) AS ngB,
                       count((COALESCE(synA.ngram1_id, idxA.ngram_id),
                              COALESCE(synB.ngram1_id, idxB.ngram_id))) AS cwei

                -- read doc index x 2
                FROM ({doc_idx}) AS idxA
                JOIN ({doc_idx}) AS idxB

                -- cooc <=> in same doc node
                    ON idxA.node_id = idxB.node_id

                -- when idxA.ngram_id is a subform
                LEFT JOIN ({synonyms}) AS synA
                    ON synA.ngram2_id = idxA.ngram_id

                -- when idxB.ngram_id is a subform
                LEFT JOIN ({synonyms}) AS synB
                    ON synB.ngram2_id = idxB.ngram_id

                GROUP BY (COALESCE(synA.ngram1_id, idxA.ngram_id),
                          COALESCE(synB.ngram1_id, idxB.ngram_id))
            ) AS cooc
            """.format(doc_idx = doc_idx_statement,
                       synonyms = syn_statement)
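    # COALESCE(syn.ngram1_id, idx.ngram_id) picks the mainform whenever the
    # LEFT JOIN found a synonym row and falls back to the occurring ngram
    # otherwise, so subform counts are folded into their mainform before GROUP BY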
    # 5) prepare 2 x nodes_ngrams aliases if whitelist
    if on_list_id:
        sql_statement += """
            JOIN nodes_ngrams AS whitelistA
                ON whitelistA.ngram_id = cooc.ngA
            JOIN nodes_ngrams AS whitelistB
                ON whitelistB.ngram_id = cooc.ngB
            """
    if stoplist_id:
        # used for reverse join
        sql_statement += """
            LEFT JOIN (
                SELECT * FROM nodes_ngrams
                WHERE nodes_ngrams.node_id = %i
            ) AS stoplistA
                ON stoplistA.ngram_id = cooc.ngA

            LEFT JOIN (
                SELECT * FROM nodes_ngrams
                WHERE nodes_ngrams.node_id = %i
            ) AS stoplistB
                ON stoplistB.ngram_id = cooc.ngB
            """ % (stoplist_id, stoplist_id)
    # 6) FILTERS
    # the inclusive threshold filter is always here
    sql_statement += "\n WHERE cooc.cwei >= %i" % threshold

    # the optional whitelist perimeters
    if on_list_id:
        sql_statement += "\n AND whitelistA.node_id = %i" % on_list_id
        sql_statement += "\n AND whitelistB.node_id = %i" % on_list_id

    if stoplist_id:
        sql_statement += "\n AND stoplistA.ngram_id IS NULL"
        sql_statement += "\n AND stoplistB.ngram_id IS NULL"

    # don't compute an ngram with itself
    # NB: this option is bad for the main toolchain
    if diagonal_filter:
        sql_statement += "\n AND ngA != ngB"

    # a filter exploiting the symmetry of the matrix
    # NB: this option is also bad for the main toolchain
    if symmetry_filter:
        sql_statement += "\n AND ngA <= ngB"
    # 7) EXECUTE QUERY
    # ----------------
    # debug
    print(sql_statement)

    # executing the SQL statement
    results = connection.execute(sql_statement)

    # => storage in our matrix structure
    matrix = WeightedMatrix(results)
    # -------------------
    # fyi
...
"""
COOCS
(this is the full sqlalchemy version, from "refactored" toolchain march-may 2016)
"""
from gargantext.models import Node, NodeNgram, NodeNgramNgram, \
NodeHyperdata, Ngram
from gargantext.util.lists import WeightedMatrix
from gargantext.util.db import session, aliased, func
from gargantext.util.db_cache import cache
from gargantext.constants import DEFAULT_COOC_THRESHOLD
from datetime import datetime
from sqlalchemy.sql.expression import case # for choice if ngram has mainform or not

def compute_coocs( corpus,
                   overwrite_id     = None,
                   just_pass_result = True,   # just return the WeightedMatrix
                                              #  (don't write to DB)
                   threshold        = DEFAULT_COOC_THRESHOLD,
                   groupings_id     = None,
                   on_list_id       = None,
                   stoplist_id      = None,
                   start            = None,
                   end              = None,
                   symmetry_filter  = False,
                   diagonal_filter  = True):
"""
Count how often some extracted terms appear
together in a small context (document)
throughout a larger context (corpus).
[NodeNgram] [NodeNgramNgram]
node_id | ngram_id | weight ngram1_id | ngram2_id | score |
--------+----------+-------- ----------+-----------+-------+
MyDocA | 487 | 1 => 487 | 294 | 2 |
MyDocA | 294 | 3
MyDocB | 487 | 1
MyDocB | 294 | 4
Fill that info in DB:
- a *new* COOCCURRENCES node
- and all corresponding NodeNgramNgram rows
worse case complexity ~ O(N²/2) with N = number of ngrams
If a mainlist is provided, we filter doc ngrams to those also in the list.
Parameters:
- the corpus node
- overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
(all hyperdata and previous NodeNgramNgram rows will be replaced)
- threshold: on output cooc count (previously called hapax)
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
- on_list_id: mainlist or maplist type, to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is already provided)
- start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
this convention: "2001-01-01" aka "%Y-%m-%d")
- symmetry_filter: prevent calculating where ngram1_id > ngram2_id
- diagonal_filter: prevent calculating where ngram1_id == ngram2_id
(deprecated parameters)
- field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
- isMonopartite: ?? used a nodes_hyperdata_ngrams table ???
basic idea for one doc
======================
each pair of ngrams sharing same doc (node_id)
SELEC idxa.ngram_id, idxb.ngram_id
FROM nodes_ngrams AS idxa
---------------------------------
JOIN nodes_ngrams AS idxb
ON idxa.node_id = idxb.node_id <== that's cooc
---------------------------------
AND idxa.ngram_id <> idxb.ngram_id (diagonal_filter)
AND idxa.node_id = MY_DOC ;
on entire corpus
=================
coocs for each doc :
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
- we count unique appearances of the pair (cooc)
"""
    # - TODO cvalue_id: allow a metric as additional input filter
    # - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
    # - TODO weighted: if False normal cooc to be saved as result
    #                  if True weighted cooc (experimental)

    # /!\ big combinatorial complexity /!\
    # for 8439 rows in the nodes_ngrams index, of which 1442 with occ > 1:
    #  - 1,859,408 rows for the simple cooc query
    #  - 71,134 rows when restricted to ngrams with occ > 1 (weight)

    # 2 x the occurrence index table
    Xindex = aliased(NodeNgram)
    Yindex = aliased(NodeNgram)

    # for debug (1/4)
    # Xngram = aliased(Ngram)
    # Yngram = aliased(Ngram)
    # 1) prepare definition of counted forms
    if not groupings_id:
        # no groupings => the counted forms are the ngrams
        Xindex_ngform_id = Xindex.ngram_id
        Yindex_ngform_id = Yindex.ngram_id

    # groupings: see the detailed comment in compute_occs() + TODO refactor
    else:
        # prepare translations
        Xsyno = (session.query(NodeNgramNgram.ngram1_id,
                               NodeNgramNgram.ngram2_id)
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        .subquery()
                )
        # further use as anon tables prevents doing Ysyno = Xsyno
        Ysyno = (session.query(NodeNgramNgram.ngram1_id,
                               NodeNgramNgram.ngram2_id)
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        .subquery()
                )

        # groupings => define the counted form depending on the existence of a synonym
        Xindex_ngform_id = case([
                (Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
                (Xsyno.c.ngram1_id == None, Xindex.ngram_id)
                #     (condition            , value)
        ])
        Yindex_ngform_id = case([
                (Ysyno.c.ngram1_id != None, Ysyno.c.ngram1_id),
                (Ysyno.c.ngram1_id == None, Yindex.ngram_id)
        ])
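        # NB this case() is the sqlalchemy counterpart of the raw-SQL version's
        #    COALESCE(syn.ngram1_id, idx.ngram_id) above: take the mainform if
        #    the outerjoined synonym row exists, else the ngram itself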
    # ---

    # 2) BASE DB QUERY

    # cooccurrences columns definition ----------------
    ucooc = func.count(Xindex_ngform_id).label("ucooc")
    # NB could be X or Y in this line
    #    (we're counting grouped rows and just happen to do it on this column)

    base_query = (
        session.query(
            Xindex_ngform_id,
            Yindex_ngform_id,
            ucooc

            # for debug (2/4)
            # , Xngram.terms.label("w_x")
            # , Yngram.terms.label("w_y")
        )
        .join(Yindex, Xindex.node_id == Yindex.node_id)   # <- by definition of cooc
        .join(Node, Node.id == Xindex.node_id)            # <- b/c within corpus
        .filter(Node.parent_id == corpus.id)              # <- b/c within corpus
        .filter(Node.typename == "DOCUMENT")              # <- b/c within corpus
    )

    # outerjoin the synonyms if needed
    if groupings_id:
        base_query = (base_query
            .outerjoin(Xsyno,                  # <- synonyms for Xindex.ngrams
                       Xsyno.c.ngram2_id == Xindex.ngram_id)
            .outerjoin(Ysyno,                  # <- synonyms for Yindex.ngrams
                       Ysyno.c.ngram2_id == Yindex.ngram_id)
        )

    # 3) counting clause in any case
    coocs_query = (base_query
        .group_by(
            Xindex_ngform_id, Yindex_ngform_id   # <- what we're counting

            # for debug (3/4)
            # ,"w_x", "w_y"
        )

        # for debug (4/4)
        # .join(Xngram, Xngram.id == Xindex_ngform_id)
        # .join(Yngram, Yngram.id == Yindex_ngform_id)

        .order_by(ucooc)
    )
    # 4) INPUT FILTERS (reduce N before O(N²))
    if on_list_id:
        # £TODO different lists, or one list for x and all ngrams for y,
        #       which would allow expanding the list to its nearest
        #       neighbors (MacLachlan), via a rectangular matrix

        m1 = aliased(NodeNgram)
        m2 = aliased(NodeNgram)

        coocs_query = ( coocs_query
            .join(m1, m1.ngram_id == Xindex_ngform_id)
            .join(m2, m2.ngram_id == Yindex_ngform_id)
            .filter( m1.node_id == on_list_id )
            .filter( m2.node_id == on_list_id )
        )
    if stoplist_id:
        s1 = (session.query(NodeNgram.ngram_id)
                     .filter(NodeNgram.node_id == stoplist_id)
                     .subquery()
             )
        # further use as anon tables prevents doing s2 = s1
        s2 = (session.query(NodeNgram.ngram_id)
                     .filter(NodeNgram.node_id == stoplist_id)
                     .subquery()
             )

        coocs_query = ( coocs_query
            .outerjoin(s1, s1.c.ngram_id == Xindex_ngform_id)
            .outerjoin(s2, s2.c.ngram_id == Yindex_ngform_id)

            # equivalent to NOT IN stoplist
            .filter( s1.c.ngram_id == None )
            .filter( s2.c.ngram_id == None )
        )

    if diagonal_filter:
        # don't compute an ngram with itself
        coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)
    if start or end:
        Time = aliased(NodeHyperdata)
        coocs_query = (coocs_query
            .join(Time, Time.node_id == Xindex.node_id)
            .filter(Time.key == "publication_date")
        )

    if start:
        if not isinstance(start, datetime):
            try:
                start = datetime.strptime(start, '%Y-%m-%d')
            except:
                raise TypeError("'start' param expects datetime object or %Y-%m-%d string")

        # the filtering by start limit
        coocs_query = coocs_query.filter(Time.value_utc >= start)

    if end:
        if not isinstance(end, datetime):
            try:
                end = datetime.strptime(end, '%Y-%m-%d')
            except:
                raise TypeError("'end' param expects datetime object or %Y-%m-%d string")

        # the filtering by end limit
        coocs_query = coocs_query.filter(Time.value_utc <= end)
    if symmetry_filter:
        # a filter exploiting the symmetry of the matrix
        #  -> halves the work!!
        #  -> but retrieval becomes more costly, needing OR queries like:
        #     WHERE ngram1 = my_ngram OR ngram2 = my_ngram
        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)

    # 5) OUTPUT FILTERS
    # ------------------
    # threshold
    # £TODO adjust COOC_THRESHOLD a posteriori:
    #       ex: sometimes 2 sometimes 4 depending on sparsity
    print("COOCS: filtering pairs under threshold:", threshold)
    coocs_query = coocs_query.having(ucooc >= threshold)
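    # NB .having() rather than .filter(): ucooc is an aggregate, so the
    #    threshold can only be tested after the GROUP BY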
    # 6) EXECUTE QUERY
    # ----------------
    # => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())
    # -------------------

    # fyi
    shape_0 = len({pair[0] for pair in matrix.items})
    shape_1 = len({pair[1] for pair in matrix.items})
    print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
    if just_pass_result:
        return matrix
    else:
        # 7) SAVE
        # --------
        # saving the parameters of the analysis in the Node JSON
        new_hyperdata = { 'corpus'   : corpus.id,
                          'threshold': threshold }

        if overwrite_id:
            # overwrite pre-existing id
            the_cooc = cache.Node[overwrite_id]
            the_cooc.hyperdata = new_hyperdata
            the_cooc.save_hyperdata()
            session.commit()
            the_id = overwrite_id
        else:
            # create the new cooc node
            the_cooc = corpus.add_child(
                typename  = "COOCCURRENCES",
                name      = "Coocs (in:%s)" % corpus.name[0:10],
                hyperdata = new_hyperdata,
            )
            session.add(the_cooc)
            session.commit()
            the_id = the_cooc.id

        # ==> save all NodeNgramNgram with link to new cooc node id
        matrix.save(the_id)

        return the_id
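
# Example usage (a sketch; "my_corpus" stands for an actual corpus Node):
#   matrix  = compute_coocs(my_corpus, just_pass_result=True, threshold=2)
#   cooc_id = compute_coocs(my_corpus, just_pass_result=False)
#             # ^ writes a COOCCURRENCES node and returns its id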