Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
3b2d568c
Commit
3b2d568c
authored
May 20, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add groups to ngram_coocs + fix date params + fix stoplist param + remove sql IN operators there
parent
92d5dfcd
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
177 additions
and
97 deletions
+177
-97
__init__.py
gargantext/util/toolchain/__init__.py
+11
-7
list_main.py
gargantext/util/toolchain/list_main.py
+3
-0
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+0
-1
ngram_coocs.py
gargantext/util/toolchain/ngram_coocs.py
+163
-89
No files found.
gargantext/util/toolchain/__init__.py
View file @
3b2d568c
...
@@ -111,15 +111,11 @@ def parse_extract_indexhyperdata(corpus):
...
@@ -111,15 +111,11 @@ def parse_extract_indexhyperdata(corpus):
group_id
=
compute_groups
(
corpus
,
stoplist_id
=
None
)
group_id
=
compute_groups
(
corpus
,
stoplist_id
=
None
)
print
(
'CORPUS #
%
d: [
%
s] new grouplist node #
%
i'
%
(
corpus
.
id
,
t
(),
group_id
))
print
(
'CORPUS #
%
d: [
%
s] new grouplist node #
%
i'
%
(
corpus
.
id
,
t
(),
group_id
))
# ------------
# -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
# -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
occ_id
=
compute_occs
(
corpus
,
groupings_id
=
group_id
)
occ_id
=
compute_occs
(
corpus
,
groupings_id
=
group_id
)
print
(
'CORPUS #
%
d: [
%
s] new occs node #
%
i'
%
(
corpus
.
id
,
t
(),
occ_id
))
print
(
'CORPUS #
%
d: [
%
s] new occs node #
%
i'
%
(
corpus
.
id
,
t
(),
occ_id
))
# ------------
# -> write local tfidf similarities to Node and NodeNodeNgram
ltfidf_id
=
compute_tfidf_local
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
# -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
# -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
tirank_id
=
compute_ti_ranking
(
corpus
,
tirank_id
=
compute_ti_ranking
(
corpus
,
groupings_id
=
group_id
,
groupings_id
=
group_id
,
...
@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
...
@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
stoplist_id
=
stop_id
)
stoplist_id
=
stop_id
)
print
(
'CORPUS #
%
d: [
%
s] new mainlist node #
%
i'
%
(
corpus
.
id
,
t
(),
mainlist_id
))
print
(
'CORPUS #
%
d: [
%
s] new mainlist node #
%
i'
%
(
corpus
.
id
,
t
(),
mainlist_id
))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id
=
compute_tfidf_local
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
# => used for doc <=> ngram association
# ------------
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id
=
compute_coocs
(
corpus
,
mainlist_id
=
mainlist_id
)
cooc_id
=
compute_coocs
(
corpus
,
mainlist_id
=
mainlist_id
,
groupings_id
=
group_id
)
print
(
'CORPUS #
%
d: [
%
s] new coocs node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
print
(
'CORPUS #
%
d: [
%
s] new coocs node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
# -> specificity: compute + write (=> NodeNodeNgram)
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id
=
compute_specificity
(
corpus
,
cooc_id
=
cooc_id
)
spec_id
=
compute_specificity
(
corpus
,
cooc_id
=
cooc_id
# ,groupings_id = group_id
)
print
(
'CORPUS #
%
d: [
%
s] new specificity node #
%
i'
%
(
corpus
.
id
,
t
(),
spec_id
))
print
(
'CORPUS #
%
d: [
%
s] new specificity node #
%
i'
%
(
corpus
.
id
,
t
(),
spec_id
))
# ?? maplist: compute + write (to Node and NodeNgram)
# ?? maplist: compute + write (to Node and NodeNgram)
...
...
gargantext/util/toolchain/list_main.py
View file @
3b2d568c
...
@@ -65,6 +65,9 @@ def do_mainlist(corpus,
...
@@ -65,6 +65,9 @@ def do_mainlist(corpus,
ordered_filtered_tfidf
=
(
session
ordered_filtered_tfidf
=
(
session
.
query
(
NodeNodeNgram
.
ngram_id
)
.
query
(
NodeNodeNgram
.
ngram_id
)
.
filter
(
NodeNodeNgram
.
node1_id
==
ranking_scores_id
)
.
filter
(
NodeNodeNgram
.
node1_id
==
ranking_scores_id
)
# NOT IN but speed theoretically ok here
# see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
# but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
.
filter
(
~
NodeNodeNgram
.
ngram_id
.
in_
(
stopterms_subquery
))
.
filter
(
~
NodeNodeNgram
.
ngram_id
.
in_
(
stopterms_subquery
))
.
order_by
(
desc
(
NodeNodeNgram
.
score
))
.
order_by
(
desc
(
NodeNodeNgram
.
score
))
)
)
...
...
gargantext/util/toolchain/metric_tfidf.py
View file @
3b2d568c
...
@@ -63,7 +63,6 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
...
@@ -63,7 +63,6 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
# ------------
# ------------
# (the occurrences are the sums for each ngram's mainform)
# (the occurrences are the sums for each ngram's mainform)
else
:
else
:
print
(
"gtoup mode"
)
# sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
# sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
syn
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
syn
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
NodeNgramNgram
.
ngram2_id
)
...
...
gargantext/util/toolchain/ngram_coocs.py
View file @
3b2d568c
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNgramNgram
,
\
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNgramNgram
,
\
NodeHyperdata
NodeHyperdata
,
Ngram
from
gargantext.util.lists
import
WeightedMatrix
from
gargantext.util.lists
import
WeightedMatrix
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.util.db_cache
import
cache
from
gargantext.util.db_cache
import
cache
from
gargantext.constants
import
DEFAULT_COOC_THRESHOLD
from
gargantext.constants
import
DEFAULT_COOC_THRESHOLD
from
datetime
import
datetime
from
datetime
import
datetime
from
sqlalchemy.sql.expression
import
case
# for choice if ngram has mainform or not
def
compute_coocs
(
corpus
,
def
compute_coocs
(
corpus
,
overwrite_id
=
None
,
overwrite_id
=
None
,
threshold
=
DEFAULT_COOC_THRESHOLD
,
threshold
=
DEFAULT_COOC_THRESHOLD
,
groupings_id
=
None
,
mainlist_id
=
None
,
mainlist_id
=
None
,
stoplist_id
=
None
,
stoplist_id
=
None
,
start
=
None
,
start
=
None
,
...
@@ -41,9 +44,11 @@ def compute_coocs( corpus,
...
@@ -41,9 +44,11 @@ def compute_coocs( corpus,
- overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
- overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
(all hyperdata and previous NodeNgramNgram rows will be replaced)
(all hyperdata and previous NodeNgramNgram rows will be replaced)
- threshold: on output cooc count (previously called hapax)
- threshold: on output cooc count (previously called hapax)
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
- mainlist_id: mainlist to constrain the input ngrams
- mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided)
(normally unnecessary if a mainlist is
already
provided)
- start, end: provide one or both temporal limits to filter on doc date
- start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
(string is also possible but format must follow
...
@@ -56,25 +61,24 @@ def compute_coocs( corpus,
...
@@ -56,25 +61,24 @@ def compute_coocs( corpus,
basic idea for one doc
basic idea for one doc
======================
======================
each pair of ngrams sharing same doc (node_id)
each pair of ngrams sharing same doc (node_id)
SELEC idx
1.ngram_id, idx2
.ngram_id
SELEC idx
a.ngram_id, idxb
.ngram_id
FROM nodes_ngrams AS idx
1, nodes_ngrams AS idx2
FROM nodes_ngrams AS idx
a, nodes_ngrams AS idxb
---------------------------------
---------------------------------
WHERE idx
1.node_id = idx2
.node_id <== that's cooc
WHERE idx
a.node_id = idxb
.node_id <== that's cooc
---------------------------------
---------------------------------
AND idx
1.ngram_id <> idx2
.ngram_id
AND idx
a.ngram_id <> idxb
.ngram_id
AND idx
1
.node_id = MY_DOC ;
AND idx
a
.node_id = MY_DOC ;
on entire corpus
on entire corpus
=================
=================
coocs for each doc :
coocs for each doc :
- each given pair like (termA, termB) will likely appear several times
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (
x1.ngram_id, x2
.ngram_id)
=> we do GROUP BY (
Xindex.ngram_id, Yindex
.ngram_id)
- we count unique appearances of the pair (cooc)
- we count unique appearances of the pair (cooc)
"""
"""
# - TODO add grouped element's values in grouping 'chief ngram'
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO weighted: if False normal cooc to be saved as result
# - TODO weighted: if False normal cooc to be saved as result
...
@@ -86,124 +90,194 @@ def compute_coocs( corpus,
...
@@ -86,124 +90,194 @@ def compute_coocs( corpus,
# 71.134 lignes en se limitant aux ngrammes qui ont une occ > 1 (weight)
# 71.134 lignes en se limitant aux ngrammes qui ont une occ > 1 (weight)
# 2 x the occurrence index table
# 2 x the occurrence index table
x1
=
aliased
(
NodeNgram
)
Xindex
=
aliased
(
NodeNgram
)
x2
=
aliased
(
NodeNgram
)
Yindex
=
aliased
(
NodeNgram
)
# cooccurrences columns definition
# for debug (1/4)
ucooc
=
func
.
count
(
x1
.
ngram_id
)
.
label
(
"ucooc"
)
# Xngram = aliased(Ngram)
# Yngram = aliased(Ngram)
# 1) MAIN DB QUERY
coocs_query
=
(
# 1) prepare definition of counted forms
session
.
query
(
x1
.
ngram_id
,
x2
.
ngram_id
,
ucooc
)
if
not
groupings_id
:
.
join
(
Node
,
Node
.
id
==
x1
.
node_id
)
# <- b/c within corpus
.
join
(
x2
,
x1
.
node_id
==
Node
.
id
)
# <- b/c within corpus
# no groupings => the counted forms are the ngrams
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
# <- b/c within corpus
Xindex_ngform_id
=
Xindex
.
ngram_id
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
# <- b/c within corpus
Yindex_ngform_id
=
Yindex
.
ngram_id
.
filter
(
x1
.
node_id
==
x2
.
node_id
)
# <- by definition of cooc
.
filter
(
x1
.
ngram_id
!=
x2
.
ngram_id
)
# <- b/c not with itself
# groupings: cf commentaire détaillé dans compute_occs() + todo facto
.
group_by
(
x1
.
ngram_id
,
x2
.
ngram_id
)
else
:
# prepare translations
Xsyno
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
groupings_id
)
.
subquery
()
)
# further use as anon tables prevent doing Ysyno = Xsyno
Ysyno
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
groupings_id
)
.
subquery
()
)
# groupings => define the counted form depending on the existence of a synonym
Xindex_ngform_id
=
case
([
(
Xsyno
.
c
.
ngram1_id
!=
None
,
Xsyno
.
c
.
ngram1_id
),
(
Xsyno
.
c
.
ngram1_id
==
None
,
Xindex
.
ngram_id
)
# condition value
])
Yindex_ngform_id
=
case
([
(
Ysyno
.
c
.
ngram1_id
!=
None
,
Ysyno
.
c
.
ngram1_id
),
(
Ysyno
.
c
.
ngram1_id
==
None
,
Yindex
.
ngram_id
)
])
# ---
# 2) BASE DB QUERY
# cooccurrences columns definition ----------------
ucooc
=
func
.
count
(
Xindex_ngform_id
)
.
label
(
"ucooc"
)
# NB could be X or Y in this line
# (we're counting grouped rows and just happen to do it on this column)
base_query
=
(
session
.
query
(
Xindex_ngform_id
,
Yindex_ngform_id
,
ucooc
# for debug (2/4)
#, Xngram.terms.label("w_x")
#, Yngram.terms.label("w_y")
)
.
join
(
Yindex
,
Xindex
.
node_id
==
Yindex
.
node_id
)
# <- by definition of cooc
.
join
(
Node
,
Node
.
id
==
Xindex
.
node_id
)
# <- b/c within corpus
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
# <- b/c within corpus
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
# <- b/c within corpus
.
filter
(
Xindex_ngform_id
!=
Yindex_ngform_id
)
# <- b/c not with itself
)
# outerjoin the synonyms if needed
if
groupings_id
:
base_query
=
(
base_query
.
outerjoin
(
Xsyno
,
# <- synonyms for Xindex.ngrams
Xsyno
.
c
.
ngram2_id
==
Xindex
.
ngram_id
)
.
outerjoin
(
Ysyno
,
# <- synonyms for Yindex.ngrams
Ysyno
.
c
.
ngram2_id
==
Yindex
.
ngram_id
)
)
# 3) counting clause in any case
coocs_query
=
(
base_query
.
group_by
(
Xindex_ngform_id
,
Yindex_ngform_id
# <- what we're counting
# for debug (3/4)
#,"w_x", "w_y"
)
# for debug (4/4)
#.join(Xngram, Xngram.id == Xindex_ngform_id)
#.join(Yngram, Yngram.id == Yindex_ngform_id)
.
order_by
(
ucooc
)
)
)
# 2) INPUT FILTERS (reduce N before O(N²))
# 4) INPUT FILTERS (reduce N before O(N²))
if
mainlist_id
:
if
mainlist_id
:
m1
=
aliased
(
NodeNgram
)
m1
=
aliased
(
NodeNgram
)
m2
=
aliased
(
NodeNgram
)
m2
=
aliased
(
NodeNgram
)
coocs_query
=
(
coocs_query
coocs_query
=
(
coocs_query
.
join
(
m1
,
m1
.
ngram_id
==
x1
.
ngra
m_id
)
.
join
(
m1
,
m1
.
ngram_id
==
Xindex_ngfor
m_id
)
.
join
(
m2
,
m2
.
ngram_id
==
x2
.
ngra
m_id
)
.
join
(
m2
,
m2
.
ngram_id
==
Yindex_ngfor
m_id
)
.
filter
(
m1
.
node_id
==
mainlist_id
)
.
filter
(
m1
.
node_id
==
mainlist_id
)
.
filter
(
m2
.
node_id
==
mainlist_id
)
.
filter
(
m2
.
node_id
==
mainlist_id
)
)
)
if
stoplist_id
:
if
stoplist_id
:
s1
=
aliased
(
NodeNgram
)
s1
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
s2
=
aliased
(
NodeNgram
)
.
filter
(
NodeNgram
.
node_id
==
stoplist_id
)
.
subquery
()
)
# further use as anon tables prevent doing s2 = s1
s2
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
==
stoplist_id
)
.
subquery
()
)
coocs_query
=
(
coocs_query
coocs_query
=
(
coocs_query
.
join
(
m1
,
s1
.
ngram_id
==
x1
.
ngram_id
)
.
outerjoin
(
s1
,
s1
.
c
.
ngram_id
==
Xindex_ngform_id
)
.
join
(
m2
,
s2
.
ngram_id
==
x2
.
ngram_id
)
.
outerjoin
(
s2
,
s2
.
c
.
ngram_id
==
Yindex_ngform_id
)
# équivalent NOT IN stoplist
.
filter
(
s1
.
c
.
ngram_id
==
None
)
.
filter
(
s2
.
c
.
ngram_id
==
None
)
.
filter
(
s1
.
node_id
==
mainlist_id
)
.
filter
(
s2
.
node_id
==
mainlist_id
)
)
)
if
start
:
if
start
or
end
:
if
isinstance
(
start
,
datetime
):
Time
=
aliased
(
NodeHyperdata
)
start_str
=
start
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
else
:
coocs_query
=
(
coocs_query
start_str
=
str
(
start
)
.
join
(
Time
,
Time
.
node_id
==
Xindex
.
node_id
)
.
filter
(
Time
.
key
==
"publication_date"
)
# doc_ids matching this limit
)
# TODO s/subqueries/inner joins/ && thanks!
starttime_subquery
=
(
session
if
start
:
.
query
(
NodeHyperdata
.
node_id
)
if
not
isinstance
(
start
,
datetime
):
.
filter
(
NodeHyperdata
.
key
==
"publication_date"
)
try
:
.
filter
(
NodeHyperdata
.
value_str
>=
start_str
)
start
=
datetime
.
strptime
(
start
,
'
%
Y-
%
m-
%
d'
)
.
subquery
()
except
:
)
raise
TypeError
(
"'start' param expects datetime object or
%%
Y-
%%
m-
%%
d string"
)
# direct use of str comparison op because there is consistency b/w
# sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
# the filtering by start limit
coocs_query
=
coocs_query
.
filter
(
Time
.
value_utc
>=
start
)
# the filtering by start limit
coocs_query
=
coocs_query
.
filter
(
x1
.
node_id
.
in_
(
starttime_subquery
))
if
end
:
if
not
isinstance
(
end
,
datetime
):
if
end
:
try
:
if
isinstance
(
end
,
datetime
):
end
=
datetime
.
strptime
(
end
,
'
%
Y-
%
m-
%
d'
)
end_str
=
end
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
except
:
else
:
raise
TypeError
(
"'end' param expects datetime object or
%%
Y-
%%
m-
%%
d string"
)
end_str
=
str
(
end
)
# the filtering by start limit
# TODO s/subqueries/inner joins/ && thanks!
coocs_query
=
coocs_query
.
filter
(
Time
.
value_utc
<=
end
)
endtime_subquery
=
(
session
.
query
(
NodeHyperdata
.
node_id
)
.
filter
(
NodeHyperdata
.
key
==
"publication_date"
)
.
filter
(
NodeHyperdata
.
value_str
<=
end_str
)
.
subquery
()
)
# the filtering by end limit
coocs_query
=
coocs_query
.
filter
(
x1
.
node_id
.
in_
(
endtime_subquery
))
if
symmetry_filter
:
if
symmetry_filter
:
# 1 filtre tenant en compte de la symétrie
# 1 filtre tenant en compte de la symétrie
# -> réduit le travail de moitié !!
# -> réduit le travail de moitié !!
# -> mais empêchera l'accès direct aux cooccurrences de x2
# -> mais récupération sera plus couteuse via des requêtes OR comme:
# -> seront éparpillées: notées dans les x1 qui ont précédé x2
# -> récupération sera plus couteuse via des requêtes OR comme:
# WHERE ngram1 = mon_ngram OR ngram2 = mon_ngram
# WHERE ngram1 = mon_ngram OR ngram2 = mon_ngram
coocs_query
=
coocs_query
.
filter
(
x1
.
ngram_id
<
x2
.
ngram_id
)
coocs_query
=
coocs_query
.
filter
(
Xindex_ngform_id
<
Yindex_ngform_id
)
# ------------
# 2 filtres amont possibles pour réduire combinatoire
# - par exemple 929k lignes => 35k lignes
# - ici sur weight mais dégrade les résultats
# => imaginable sur une autre métrique (cvalue ou tfidf?)
# coocs_query = coocs_query.filter(x1.weight > 1)
# coocs_query = coocs_query.filter(x2.weight > 1)
# ------------
#
3
) OUTPUT FILTERS
#
5
) OUTPUT FILTERS
# ------------------
# ------------------
# threshold
# threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
# ex: sometimes 2 sometimes 4 depending on sparsity
coocs_query
=
coocs_query
.
having
(
ucooc
>=
threshold
)
coocs_query
=
coocs_query
.
having
(
ucooc
>=
threshold
)
# 4) EXECUTE QUERY
# 6) EXECUTE QUERY
# ----------------
# ----------------
# => storage in our matrix structure
# => storage in our matrix structure
matrix
=
WeightedMatrix
(
coocs_query
.
all
())
matrix
=
WeightedMatrix
(
coocs_query
.
all
())
# -------------------
# fyi
# fyi
#
shape_0 = len({pair[0] for pair in matrix.items})
shape_0
=
len
({
pair
[
0
]
for
pair
in
matrix
.
items
})
#
shape_1 = len({pair[1] for pair in matrix.items})
shape_1
=
len
({
pair
[
1
]
for
pair
in
matrix
.
items
})
#
print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
print
(
"COOCS: NEW matrix shape [
%
ix
%
i]"
%
(
shape_0
,
shape_1
))
# 5) SAVE
# 5) SAVE
# --------
# --------
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment