Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
33ba94b8
Commit
33ba94b8
authored
Mar 14, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
tidying up
parent
6260e8c1
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
12 additions
and
137 deletions
+12
-137
__init__.py
gargantext/util/toolchain/__init__.py
+8
-8
list_map.py
gargantext/util/toolchain/list_map.py
+1
-0
metric_specificity.py
gargantext/util/toolchain/metric_specificity.py
+0
-0
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+1
-1
ngram_coocs.py
gargantext/util/toolchain/ngram_coocs.py
+2
-1
ngrams_tools.py
gargantext/util/toolchain/ngrams_tools.py
+0
-68
score_occurrences.py
gargantext/util/toolchain/score_occurrences.py
+0
-59
No files found.
gargantext/util/toolchain/__init__.py
View file @
33ba94b8
...
@@ -2,13 +2,13 @@ from .parsing import parse
...
@@ -2,13 +2,13 @@ from .parsing import parse
from
.ngrams_extraction
import
extract_ngrams
from
.ngrams_extraction
import
extract_ngrams
# in usual run order
# in usual run order
from
.list_stop
import
do_stoplist
from
.list_stop
import
do_stoplist
from
.
ngram_scores
import
compute_occurrences_local
,
compute_tfidf
from
.
metric_tfidf
import
compute_occs
,
compute_tfidf
from
.list_main
import
do_mainlist
from
.list_main
import
do_mainlist
from
.ngram_coocs
_tempo
import
compute_coocs
from
.ngram_coocs
import
compute_coocs
from
.
score_specificity
import
compute_specificity
from
.
metric_specificity
import
compute_specificity
from
.list_map
import
do_maplist
# TEST
from
.list_map
import
do_maplist
# TEST
from
.ngram_groups
import
compute_groups
from
.ngram_groups
import
compute_groups
from
gargantext.util.db
import
session
from
gargantext.util.db
import
session
from
gargantext.models
import
Node
from
gargantext.models
import
Node
...
@@ -50,7 +50,7 @@ def parse_extract(corpus):
...
@@ -50,7 +50,7 @@ def parse_extract(corpus):
print
(
'CORPUS #
%
d: [
%
s] new grouplist node #
%
i'
%
(
corpus
.
id
,
t
(),
group_id
))
print
(
'CORPUS #
%
d: [
%
s] new grouplist node #
%
i'
%
(
corpus
.
id
,
t
(),
group_id
))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id
=
compute_occ
urrences_local
(
corpus
)
occ_id
=
compute_occ
s
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new occs node #
%
i'
%
(
corpus
.
id
,
t
(),
occ_id
))
print
(
'CORPUS #
%
d: [
%
s] new occs node #
%
i'
%
(
corpus
.
id
,
t
(),
occ_id
))
# ------------
# ------------
...
...
gargantext/util/toolchain/list_map.py
View file @
33ba94b8
...
@@ -67,6 +67,7 @@ def do_maplist(corpus,
...
@@ -67,6 +67,7 @@ def do_maplist(corpus,
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
primary_groupterms_subquery
))
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
primary_groupterms_subquery
))
)
)
# TODO: move these 2 pools up to mainlist selection
top_monograms
=
(
query
top_monograms
=
(
query
.
filter
(
Ngram
.
n
==
1
)
.
filter
(
Ngram
.
n
==
1
)
.
order_by
(
desc
(
ScoreSpec
.
weight
))
.
order_by
(
desc
(
ScoreSpec
.
weight
))
...
...
gargantext/util/toolchain/
score
_specificity.py
→
gargantext/util/toolchain/
metric
_specificity.py
View file @
33ba94b8
File moved
gargantext/util/toolchain/
ngram_scores
.py
→
gargantext/util/toolchain/
metric_tfidf
.py
View file @
33ba94b8
...
@@ -16,7 +16,7 @@ from math import log
...
@@ -16,7 +16,7 @@ from math import log
# from gargantext.util.lists import WeightedContextIndex
# from gargantext.util.lists import WeightedContextIndex
def
compute_occ
urrences_local
(
corpus
,
overwrite_id
=
None
):
def
compute_occ
s
(
corpus
,
overwrite_id
=
None
):
"""
"""
Calculates sum of occs per ngram within corpus
Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
(used as info in the ngrams table view)
...
...
gargantext/util/toolchain/ngram_coocs
_tempo
.py
→
gargantext/util/toolchain/ngram_coocs.py
View file @
33ba94b8
...
@@ -103,7 +103,6 @@ def compute_coocs(corpus,
...
@@ -103,7 +103,6 @@ def compute_coocs(corpus,
)
)
# 2) INPUT FILTERS (reduce N before O(N²))
# 2) INPUT FILTERS (reduce N before O(N²))
# £TODO add possibility to restrict to the mainlist
if
mainlist_id
:
if
mainlist_id
:
main_subquery
=
(
main_subquery
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
session
.
query
(
NodeNgram
.
ngram_id
)
...
@@ -150,6 +149,8 @@ def compute_coocs(corpus,
...
@@ -150,6 +149,8 @@ def compute_coocs(corpus,
# 3) OUTPUT FILTERS
# 3) OUTPUT FILTERS
# ------------------
# ------------------
# threshold
# threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
coocs_query
=
coocs_query
.
having
(
ucooc
>=
threshold
)
coocs_query
=
coocs_query
.
having
(
ucooc
>=
threshold
)
# 4) EXECUTE QUERY
# 4) EXECUTE QUERY
...
...
gargantext/util/toolchain/ngrams_tools.py
deleted
100644 → 0
View file @
6260e8c1
from
gargantext.util.db
import
*
from
gargantext.util.db_cache
import
*
from
gargantext.constants
import
*
from
gargantext.models.ngrams
import
Ngram
,
NodeNgram
,
NodeNgramNgram
def insert_ngrams(ngrams, get='terms-id'):
    '''
    insert_ngrams :: [(String, Int)] -> dict[terms] = id

    Bulk-loads the given (terms, n) pairs into a temporary table, creates
    the rows that do not exist yet in the Ngram table, and returns a dict
    mapping each term string to the id found for it.

    NOTE(review): the `get` argument is accepted but never read here.
    '''
    db, cursor = get_cursor()
    # the Ngram table name is interpolated into each statement below
    ngram_table = Ngram.__table__.name

    # staging table: `id` stays NULL until a matching Ngram row is found
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__ngram (
            id INT,
            terms VARCHAR(255) NOT NULL,
            n INT
        );
    ''')
    bulk_insert('tmp__ngram', ['terms', 'n'], ngrams, cursor=cursor)

    # 1) pick up the ids of the ngrams that already exist (match on terms)
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            tmp__ngram.terms = ngram.terms
    ''' % (ngram_table,))

    # 2) create the ngrams that are still without an id
    cursor.execute('''
        INSERT INTO
            %s (terms, n)
        SELECT
            terms, n
        FROM
            tmp__ngram
        WHERE
            id IS NULL
    ''' % (ngram_table,))

    # 3) pick up the ids of the rows created in step 2 (match on terms and n)
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            ngram.terms = tmp__ngram.terms
        AND
            ngram.n = tmp__ngram.n
        AND
            tmp__ngram.id IS NULL
    ''' % (ngram_table,))

    # read the staging table back as {terms: id}
    cursor.execute('SELECT id, terms FROM tmp__ngram')
    ids_by_terms = {record[1]: record[0] for record in cursor.fetchall()}

    db.commit()
    return ids_by_terms
gargantext/util/toolchain/score_occurrences.py
deleted
100644 → 0
View file @
6260e8c1
from
gargantext_web.db
import
get_session
,
cache
,
get_cursor
from
gargantext_web.db
import
Node
,
NodeNgram
,
NodeNodeNgram
from
gargantext_web.db
import
get_or_create_node
#from admin.utils import DebugTime
def compute_occs(corpus, debug=True):
    '''
    compute_occs :: Corpus -> IO ()

    Rebuilds the 'Occurrences' scores of a corpus.

    For every ngram, sums NodeNgram.weight over the nodes whose parent is
    `corpus` and whose type is the 'Document' node type, and writes one
    NodeNodeNgram row per ngram with:
        nodex_id = id of the corpus' 'Occurrences' node
        nodey_id = corpus.id
        score    = the summed weight

    Rows previously attached to that 'Occurrences' node are deleted first.

    Parameters:
        corpus: the corpus node (its `id` is read).
        debug:  when exactly True, the freshly written rows are queried
                back and printed.

    Returns nothing.
    '''
    # The previous version used `mysession` and `session` without ever
    # binding them, which raised NameError on the first call. A single
    # session is now taken from get_session() and used for both.
    # NOTE(review): assumes get_session() returns the kind of session that
    # get_or_create_node expects as `mysession` -- confirm against its
    # definition.
    session = get_session()

    occs_node = get_or_create_node(nodetype='Occurrences',
                                   corpus=corpus,
                                   mysession=session)

    # wipe the scores of any earlier run before inserting the new ones
    (session.query(NodeNodeNgram)
            .filter(NodeNodeNgram.nodex_id == occs_node.id)
            .delete()
    )
    session.commit()

    db, cursor = get_cursor()
    # Only table names from the ORM and integer ids are interpolated here
    # (the %d conversions reject anything that is not a number).
    cursor.execute('''
        INSERT INTO
            %s (nodex_id, nodey_id, ngram_id, score)
        SELECT
            %d AS nodex_id,
            %d AS nodey_id,
            nodengram.ngram_id AS ngram_id,
            SUM(nodengram.weight) AS score
        FROM
            %s AS nodengram
        INNER JOIN
            %s AS node ON nodengram.node_id = node.id
        WHERE
            node.parent_id = %d
        AND
            node.type_id = %d
        GROUP BY
            nodengram.ngram_id
    ''' % (NodeNodeNgram.__table__.name,
           occs_node.id,
           corpus.id,
           NodeNgram.__table__.name,
           Node.__table__.name,
           corpus.id,
           cache.NodeType['Document'].id,
          )
    )
    db.commit()

    if debug is True:
        data = (session.query(NodeNodeNgram)
                       .filter(NodeNodeNgram.nodex_id == occs_node.id)
                       .all()
                )
        print(list(data))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment