Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
3b2d568c
Commit
3b2d568c
authored
May 20, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add groups to ngram_coocs + fix date params + fix stoplist param + remove sql IN operators there
parent
92d5dfcd
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
177 additions
and
97 deletions
+177
-97
__init__.py
gargantext/util/toolchain/__init__.py
+11
-7
list_main.py
gargantext/util/toolchain/list_main.py
+3
-0
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+0
-1
ngram_coocs.py
gargantext/util/toolchain/ngram_coocs.py
+163
-89
No files found.
gargantext/util/toolchain/__init__.py
View file @
3b2d568c
...
...
@@ -111,15 +111,11 @@ def parse_extract_indexhyperdata(corpus):
group_id
=
compute_groups
(
corpus
,
stoplist_id
=
None
)
print
(
'CORPUS #
%
d: [
%
s] new grouplist node #
%
i'
%
(
corpus
.
id
,
t
(),
group_id
))
# ------------
# -> write occurrences to Node and NodeNodeNgram # (todo: NodeNgram)
occ_id
=
compute_occs
(
corpus
,
groupings_id
=
group_id
)
print
(
'CORPUS #
%
d: [
%
s] new occs node #
%
i'
%
(
corpus
.
id
,
t
(),
occ_id
))
# ------------
# -> write local tfidf similarities to Node and NodeNodeNgram
ltfidf_id
=
compute_tfidf_local
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
# -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram (todo: NodeNgram)
tirank_id
=
compute_ti_ranking
(
corpus
,
groupings_id
=
group_id
,
...
...
@@ -132,13 +128,21 @@ def parse_extract_indexhyperdata(corpus):
stoplist_id
=
stop_id
)
print
(
'CORPUS #
%
d: [
%
s] new mainlist node #
%
i'
%
(
corpus
.
id
,
t
(),
mainlist_id
))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id
=
compute_tfidf_local
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
# => used for doc <=> ngram association
# ------------
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id
=
compute_coocs
(
corpus
,
mainlist_id
=
mainlist_id
)
cooc_id
=
compute_coocs
(
corpus
,
mainlist_id
=
mainlist_id
,
groupings_id
=
group_id
)
print
(
'CORPUS #
%
d: [
%
s] new coocs node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
# -> specificity: compute + write (=> NodeNodeNgram)
spec_id
=
compute_specificity
(
corpus
,
cooc_id
=
cooc_id
)
spec_id
=
compute_specificity
(
corpus
,
cooc_id
=
cooc_id
# ,groupings_id = group_id
)
print
(
'CORPUS #
%
d: [
%
s] new specificity node #
%
i'
%
(
corpus
.
id
,
t
(),
spec_id
))
# ?? maplist: compute + write (to Node and NodeNgram)
...
...
gargantext/util/toolchain/list_main.py
View file @
3b2d568c
...
...
@@ -65,6 +65,9 @@ def do_mainlist(corpus,
ordered_filtered_tfidf
=
(
session
.
query
(
NodeNodeNgram
.
ngram_id
)
.
filter
(
NodeNodeNgram
.
node1_id
==
ranking_scores_id
)
# NOT IN but speed theoretically ok here
# see http://sqlperformance.com/2012/12/t-sql-queries/left-anti-semi-join
# but http://stackoverflow.com/questions/2246772/whats-the-difference-between-not-exists-vs-not-in-vs-left-join-where-is-null/2246793#2246793
.
filter
(
~
NodeNodeNgram
.
ngram_id
.
in_
(
stopterms_subquery
))
.
order_by
(
desc
(
NodeNodeNgram
.
score
))
)
...
...
gargantext/util/toolchain/metric_tfidf.py
View file @
3b2d568c
...
...
@@ -63,7 +63,6 @@ def compute_occs(corpus, overwrite_id = None, groupings_id = None,):
# ------------
# (the occurrences are the sums for each ngram's mainform)
else
:
print
(
"gtoup mode"
)
# sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
syn
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
...
...
gargantext/util/toolchain/ngram_coocs.py
View file @
3b2d568c
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNgramNgram
,
\
NodeHyperdata
NodeHyperdata
,
Ngram
from
gargantext.util.lists
import
WeightedMatrix
from
gargantext.util.db
import
session
,
aliased
,
func
from
gargantext.util.db_cache
import
cache
from
gargantext.constants
import
DEFAULT_COOC_THRESHOLD
from
datetime
import
datetime
from
sqlalchemy.sql.expression
import
case
# for choice if ngram has mainform or not
def
compute_coocs
(
corpus
,
overwrite_id
=
None
,
threshold
=
DEFAULT_COOC_THRESHOLD
,
groupings_id
=
None
,
mainlist_id
=
None
,
stoplist_id
=
None
,
start
=
None
,
...
...
@@ -41,9 +44,11 @@ def compute_coocs( corpus,
- overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
(all hyperdata and previous NodeNgramNgram rows will be replaced)
- threshold: on output cooc count (previously called hapax)
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
- mainlist_id: mainlist to constrain the input ngrams
- stoplist_id: stoplist for filtering input ngrams
(normally unnecessary if a mainlist is provided)
(normally unnecessary if a mainlist is
already
provided)
- start, end: provide one or both temporal limits to filter on doc date
NB the expected type of parameter value is datetime.datetime
(string is also possible but format must follow
...
...
@@ -56,25 +61,24 @@ def compute_coocs( corpus,
basic idea for one doc
======================
each pair of ngrams sharing same doc (node_id)
SELEC idx
1.ngram_id, idx2
.ngram_id
FROM nodes_ngrams AS idx
1, nodes_ngrams AS idx2
SELEC idx
a.ngram_id, idxb
.ngram_id
FROM nodes_ngrams AS idx
a, nodes_ngrams AS idxb
---------------------------------
WHERE idx
1.node_id = idx2
.node_id <== that's cooc
WHERE idx
a.node_id = idxb
.node_id <== that's cooc
---------------------------------
AND idx
1.ngram_id <> idx2
.ngram_id
AND idx
1
.node_id = MY_DOC ;
AND idx
a.ngram_id <> idxb
.ngram_id
AND idx
a
.node_id = MY_DOC ;
on entire corpus
=================
coocs for each doc :
- each given pair like (termA, termB) will likely appear several times
=> we do GROUP BY (
x1.ngram_id, x2
.ngram_id)
=> we do GROUP BY (
Xindex.ngram_id, Yindex
.ngram_id)
- we count unique appearances of the pair (cooc)
"""
# - TODO add grouped element's values in grouping 'chief ngram'
# - TODO cvalue_id: allow a metric as additional input filter
# - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
# - TODO weighted: if False normal cooc to be saved as result
...
...
@@ -86,124 +90,194 @@ def compute_coocs( corpus,
# 71.134 lignes en se limitant aux ngrammes qui ont une occ > 1 (weight)
# 2 x the occurrence index table
x1
=
aliased
(
NodeNgram
)
x2
=
aliased
(
NodeNgram
)
# cooccurrences columns definition
ucooc
=
func
.
count
(
x1
.
ngram_id
)
.
label
(
"ucooc"
)
# 1) MAIN DB QUERY
coocs_query
=
(
session
.
query
(
x1
.
ngram_id
,
x2
.
ngram_id
,
ucooc
)
.
join
(
Node
,
Node
.
id
==
x1
.
node_id
)
# <- b/c within corpus
.
join
(
x2
,
x1
.
node_id
==
Node
.
id
)
# <- b/c within corpus
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
# <- b/c within corpus
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
# <- b/c within corpus
.
filter
(
x1
.
node_id
==
x2
.
node_id
)
# <- by definition of cooc
.
filter
(
x1
.
ngram_id
!=
x2
.
ngram_id
)
# <- b/c not with itself
.
group_by
(
x1
.
ngram_id
,
x2
.
ngram_id
)
Xindex
=
aliased
(
NodeNgram
)
Yindex
=
aliased
(
NodeNgram
)
# for debug (1/4)
# Xngram = aliased(Ngram)
# Yngram = aliased(Ngram)
# 1) prepare definition of counted forms
if
not
groupings_id
:
# no groupings => the counted forms are the ngrams
Xindex_ngform_id
=
Xindex
.
ngram_id
Yindex_ngform_id
=
Yindex
.
ngram_id
# groupings: cf commentaire détaillé dans compute_occs() + todo facto
else
:
# prepare translations
Xsyno
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
groupings_id
)
.
subquery
()
)
# further use as anon tables prevent doing Ysyno = Xsyno
Ysyno
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
groupings_id
)
.
subquery
()
)
# groupings => define the counted form depending on the existence of a synonym
Xindex_ngform_id
=
case
([
(
Xsyno
.
c
.
ngram1_id
!=
None
,
Xsyno
.
c
.
ngram1_id
),
(
Xsyno
.
c
.
ngram1_id
==
None
,
Xindex
.
ngram_id
)
# condition value
])
Yindex_ngform_id
=
case
([
(
Ysyno
.
c
.
ngram1_id
!=
None
,
Ysyno
.
c
.
ngram1_id
),
(
Ysyno
.
c
.
ngram1_id
==
None
,
Yindex
.
ngram_id
)
])
# ---
# 2) BASE DB QUERY
# cooccurrences columns definition ----------------
ucooc
=
func
.
count
(
Xindex_ngform_id
)
.
label
(
"ucooc"
)
# NB could be X or Y in this line
# (we're counting grouped rows and just happen to do it on this column)
base_query
=
(
session
.
query
(
Xindex_ngform_id
,
Yindex_ngform_id
,
ucooc
# for debug (2/4)
#, Xngram.terms.label("w_x")
#, Yngram.terms.label("w_y")
)
.
join
(
Yindex
,
Xindex
.
node_id
==
Yindex
.
node_id
)
# <- by definition of cooc
.
join
(
Node
,
Node
.
id
==
Xindex
.
node_id
)
# <- b/c within corpus
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
# <- b/c within corpus
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
# <- b/c within corpus
.
filter
(
Xindex_ngform_id
!=
Yindex_ngform_id
)
# <- b/c not with itself
)
# outerjoin the synonyms if needed
if
groupings_id
:
base_query
=
(
base_query
.
outerjoin
(
Xsyno
,
# <- synonyms for Xindex.ngrams
Xsyno
.
c
.
ngram2_id
==
Xindex
.
ngram_id
)
.
outerjoin
(
Ysyno
,
# <- synonyms for Yindex.ngrams
Ysyno
.
c
.
ngram2_id
==
Yindex
.
ngram_id
)
)
# 3) counting clause in any case
coocs_query
=
(
base_query
.
group_by
(
Xindex_ngform_id
,
Yindex_ngform_id
# <- what we're counting
# for debug (3/4)
#,"w_x", "w_y"
)
# for debug (4/4)
#.join(Xngram, Xngram.id == Xindex_ngform_id)
#.join(Yngram, Yngram.id == Yindex_ngform_id)
.
order_by
(
ucooc
)
)
# 2) INPUT FILTERS (reduce N before O(N²))
# 4) INPUT FILTERS (reduce N before O(N²))
if
mainlist_id
:
m1
=
aliased
(
NodeNgram
)
m2
=
aliased
(
NodeNgram
)
coocs_query
=
(
coocs_query
.
join
(
m1
,
m1
.
ngram_id
==
x1
.
ngra
m_id
)
.
join
(
m2
,
m2
.
ngram_id
==
x2
.
ngra
m_id
)
.
join
(
m1
,
m1
.
ngram_id
==
Xindex_ngfor
m_id
)
.
join
(
m2
,
m2
.
ngram_id
==
Yindex_ngfor
m_id
)
.
filter
(
m1
.
node_id
==
mainlist_id
)
.
filter
(
m2
.
node_id
==
mainlist_id
)
)
if
stoplist_id
:
s1
=
aliased
(
NodeNgram
)
s2
=
aliased
(
NodeNgram
)
s1
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
==
stoplist_id
)
.
subquery
()
)
# further use as anon tables prevent doing s2 = s1
s2
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
.
filter
(
NodeNgram
.
node_id
==
stoplist_id
)
.
subquery
()
)
coocs_query
=
(
coocs_query
.
join
(
m1
,
s1
.
ngram_id
==
x1
.
ngram_id
)
.
join
(
m2
,
s2
.
ngram_id
==
x2
.
ngram_id
)
.
outerjoin
(
s1
,
s1
.
c
.
ngram_id
==
Xindex_ngform_id
)
.
outerjoin
(
s2
,
s2
.
c
.
ngram_id
==
Yindex_ngform_id
)
# équivalent NOT IN stoplist
.
filter
(
s1
.
c
.
ngram_id
==
None
)
.
filter
(
s2
.
c
.
ngram_id
==
None
)
.
filter
(
s1
.
node_id
==
mainlist_id
)
.
filter
(
s2
.
node_id
==
mainlist_id
)
)
if
start
:
if
isinstance
(
start
,
datetime
):
start_str
=
start
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
else
:
start_str
=
str
(
start
)
# doc_ids matching this limit
# TODO s/subqueries/inner joins/ && thanks!
starttime_subquery
=
(
session
.
query
(
NodeHyperdata
.
node_id
)
.
filter
(
NodeHyperdata
.
key
==
"publication_date"
)
.
filter
(
NodeHyperdata
.
value_str
>=
start_str
)
.
subquery
()
)
# direct use of str comparison op because there is consistency b/w
# sql alpha sort and chrono sort *in this format %Y-%m-%d %H:%M:%S*
# the filtering by start limit
coocs_query
=
coocs_query
.
filter
(
x1
.
node_id
.
in_
(
starttime_subquery
))
if
end
:
if
isinstance
(
end
,
datetime
):
end_str
=
end
.
strftime
(
"
%
Y-
%
m-
%
d
%
H:
%
M:
%
S"
)
else
:
end_str
=
str
(
end
)
# TODO s/subqueries/inner joins/ && thanks!
endtime_subquery
=
(
session
.
query
(
NodeHyperdata
.
node_id
)
.
filter
(
NodeHyperdata
.
key
==
"publication_date"
)
.
filter
(
NodeHyperdata
.
value_str
<=
end_str
)
.
subquery
()
)
# the filtering by end limit
coocs_query
=
coocs_query
.
filter
(
x1
.
node_id
.
in_
(
endtime_subquery
))
if
start
or
end
:
Time
=
aliased
(
NodeHyperdata
)
coocs_query
=
(
coocs_query
.
join
(
Time
,
Time
.
node_id
==
Xindex
.
node_id
)
.
filter
(
Time
.
key
==
"publication_date"
)
)
if
start
:
if
not
isinstance
(
start
,
datetime
):
try
:
start
=
datetime
.
strptime
(
start
,
'
%
Y-
%
m-
%
d'
)
except
:
raise
TypeError
(
"'start' param expects datetime object or
%%
Y-
%%
m-
%%
d string"
)
# the filtering by start limit
coocs_query
=
coocs_query
.
filter
(
Time
.
value_utc
>=
start
)
if
end
:
if
not
isinstance
(
end
,
datetime
):
try
:
end
=
datetime
.
strptime
(
end
,
'
%
Y-
%
m-
%
d'
)
except
:
raise
TypeError
(
"'end' param expects datetime object or
%%
Y-
%%
m-
%%
d string"
)
# the filtering by start limit
coocs_query
=
coocs_query
.
filter
(
Time
.
value_utc
<=
end
)
if
symmetry_filter
:
# 1 filtre tenant en compte de la symétrie
# -> réduit le travail de moitié !!
# -> mais empêchera l'accès direct aux cooccurrences de x2
# -> seront éparpillées: notées dans les x1 qui ont précédé x2
# -> récupération sera plus couteuse via des requêtes OR comme:
# -> mais récupération sera plus couteuse via des requêtes OR comme:
# WHERE ngram1 = mon_ngram OR ngram2 = mon_ngram
coocs_query
=
coocs_query
.
filter
(
x1
.
ngram_id
<
x2
.
ngram_id
)
# ------------
# 2 filtres amont possibles pour réduire combinatoire
# - par exemple 929k lignes => 35k lignes
# - ici sur weight mais dégrade les résultats
# => imaginable sur une autre métrique (cvalue ou tfidf?)
# coocs_query = coocs_query.filter(x1.weight > 1)
# coocs_query = coocs_query.filter(x2.weight > 1)
# ------------
coocs_query
=
coocs_query
.
filter
(
Xindex_ngform_id
<
Yindex_ngform_id
)
#
3
) OUTPUT FILTERS
#
5
) OUTPUT FILTERS
# ------------------
# threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
coocs_query
=
coocs_query
.
having
(
ucooc
>=
threshold
)
# 4) EXECUTE QUERY
# 6) EXECUTE QUERY
# ----------------
# => storage in our matrix structure
matrix
=
WeightedMatrix
(
coocs_query
.
all
())
# -------------------
# fyi
#
shape_0 = len({pair[0] for pair in matrix.items})
#
shape_1 = len({pair[1] for pair in matrix.items})
#
print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))
shape_0
=
len
({
pair
[
0
]
for
pair
in
matrix
.
items
})
shape_1
=
len
({
pair
[
1
]
for
pair
in
matrix
.
items
})
print
(
"COOCS: NEW matrix shape [
%
ix
%
i]"
%
(
shape_0
,
shape_1
))
# 5) SAVE
# --------
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment