Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
144b774e
Commit
144b774e
authored
9 years ago
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
tidying up
parent
58aa990d
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
12 additions
and
137 deletions
+12
-137
__init__.py
gargantext/util/toolchain/__init__.py
+8
-8
list_map.py
gargantext/util/toolchain/list_map.py
+1
-0
metric_specificity.py
gargantext/util/toolchain/metric_specificity.py
+0
-0
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+1
-1
ngram_coocs.py
gargantext/util/toolchain/ngram_coocs.py
+2
-1
ngrams_tools.py
gargantext/util/toolchain/ngrams_tools.py
+0
-68
score_occurrences.py
gargantext/util/toolchain/score_occurrences.py
+0
-59
No files found.
gargantext/util/toolchain/__init__.py
View file @
144b774e
...
...
@@ -2,13 +2,13 @@ from .parsing import parse
from
.ngrams_extraction
import
extract_ngrams
# in usual run order
from
.list_stop
import
do_stoplist
from
.
ngram_scores
import
compute_occurrences_local
,
compute_tfidf
from
.list_main
import
do_mainlist
from
.ngram_coocs
_tempo
import
compute_coocs
from
.
score_specificity
import
compute_specificity
from
.list_map
import
do_maplist
# TEST
from
.ngram_groups
import
compute_groups
from
.list_stop
import
do_stoplist
from
.
metric_tfidf
import
compute_occs
,
compute_tfidf
from
.list_main
import
do_mainlist
from
.ngram_coocs
import
compute_coocs
from
.
metric_specificity
import
compute_specificity
from
.list_map
import
do_maplist
# TEST
from
.ngram_groups
import
compute_groups
from
gargantext.util.db
import
session
from
gargantext.models
import
Node
...
...
@@ -50,7 +50,7 @@ def parse_extract(corpus):
print
(
'CORPUS #
%
d: [
%
s] new grouplist node #
%
i'
%
(
corpus
.
id
,
t
(),
group_id
))
# -> write occurrences to Node and NodeNodeNgram # possible: factorize with tfidf
occ_id
=
compute_occ
urrences_local
(
corpus
)
occ_id
=
compute_occ
s
(
corpus
)
print
(
'CORPUS #
%
d: [
%
s] new occs node #
%
i'
%
(
corpus
.
id
,
t
(),
occ_id
))
# ------------
...
...
This diff is collapsed.
Click to expand it.
gargantext/util/toolchain/list_map.py
View file @
144b774e
...
...
@@ -67,6 +67,7 @@ def do_maplist(corpus,
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
primary_groupterms_subquery
))
)
# TODO: move these 2 pools up to mainlist selection
top_monograms
=
(
query
.
filter
(
Ngram
.
n
==
1
)
.
order_by
(
desc
(
ScoreSpec
.
weight
))
...
...
This diff is collapsed.
Click to expand it.
gargantext/util/toolchain/
score
_specificity.py
→
gargantext/util/toolchain/
metric
_specificity.py
View file @
144b774e
File moved
This diff is collapsed.
Click to expand it.
gargantext/util/toolchain/
ngram_scores
.py
→
gargantext/util/toolchain/
metric_tfidf
.py
View file @
144b774e
...
...
@@ -16,7 +16,7 @@ from math import log
# from gargantext.util.lists import WeightedContextIndex
def
compute_occ
urrences_local
(
corpus
,
overwrite_id
=
None
):
def
compute_occ
s
(
corpus
,
overwrite_id
=
None
):
"""
Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
...
...
This diff is collapsed.
Click to expand it.
gargantext/util/toolchain/ngram_coocs
_tempo
.py
→
gargantext/util/toolchain/ngram_coocs.py
View file @
144b774e
...
...
@@ -103,7 +103,6 @@ def compute_coocs(corpus,
)
# 2) INPUT FILTERS (reduce N before O(N²))
# £TODO add possibility to restrict to the mainlist
if
mainlist_id
:
main_subquery
=
(
session
.
query
(
NodeNgram
.
ngram_id
)
...
...
@@ -150,6 +149,8 @@ def compute_coocs(corpus,
# 3) OUTPUT FILTERS
# ------------------
# threshold
# £TODO adjust COOC_THRESHOLD a posteriori:
# ex: sometimes 2 sometimes 4 depending on sparsity
coocs_query
=
coocs_query
.
having
(
ucooc
>=
threshold
)
# 4) EXECUTE QUERY
...
...
This diff is collapsed.
Click to expand it.
gargantext/util/toolchain/ngrams_tools.py
deleted
100644 → 0
View file @
58aa990d
from
gargantext.util.db
import
*
from
gargantext.util.db_cache
import
*
from
gargantext.constants
import
*
from
gargantext.models.ngrams
import
Ngram
,
NodeNgram
,
NodeNgramNgram
def insert_ngrams(ngrams, get='terms-id'):
    '''
    insert_ngrams :: [(String, Int)] -> dict[terms] = id

    Bulk-loads (terms, n) pairs into a temporary table, inserts into the
    Ngram table those whose terms are not there yet, and returns a dict
    mapping each terms string to its id in the Ngram table.

    ngrams -- rows of (terms, n), passed as-is to bulk_insert
    get    -- not read anywhere in this body; kept so existing callers
              that pass it keep working
    '''
    db, cursor = get_cursor()

    # The table name comes from the model, not from user input, so it is
    # interpolated directly into the SQL templates below.
    ngram_table = Ngram.__table__.name

    # Staging table: id stays NULL until matched against the Ngram table.
    # NOTE(review): the table is never dropped here; presumably it goes away
    # with the connection -- confirm if get_cursor() reuses connections.
    cursor.execute('''
        CREATE TEMPORARY TABLE tmp__ngram (
            id INT,
            terms VARCHAR(255) NOT NULL,
            n INT
        );
    ''')
    bulk_insert('tmp__ngram', ['terms', 'n'], ngrams, cursor=cursor)

    # 1) pick up the ids of ngrams that already exist (matched on terms only)
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            tmp__ngram.terms = ngram.terms
    ''' % (ngram_table,))

    # 2) create the ngrams that are still unknown
    cursor.execute('''
        INSERT INTO
            %s (terms, n)
        SELECT
            terms, n
        FROM
            tmp__ngram
        WHERE
            id IS NULL
    ''' % (ngram_table,))

    # 3) pick up the ids of the rows created in step 2
    cursor.execute('''
        UPDATE
            tmp__ngram
        SET
            id = ngram.id
        FROM
            %s AS ngram
        WHERE
            ngram.terms = tmp__ngram.terms
        AND
            ngram.n = tmp__ngram.n
        AND
            tmp__ngram.id IS NULL
    ''' % (ngram_table,))

    # Result rows are (id, terms); the returned dict is keyed by terms.
    cursor.execute('SELECT id, terms FROM tmp__ngram')
    ngram_ids = {row[1]: row[0] for row in cursor.fetchall()}

    db.commit()
    return ngram_ids
This diff is collapsed.
Click to expand it.
gargantext/util/toolchain/score_occurrences.py
deleted
100644 → 0
View file @
58aa990d
from
gargantext_web.db
import
get_session
,
cache
,
get_cursor
from
gargantext_web.db
import
Node
,
NodeNgram
,
NodeNodeNgram
from
gargantext_web.db
import
get_or_create_node
#from admin.utils import DebugTime
def compute_occs(corpus, debug=True):
    '''
    compute_occs :: Corpus -> IO ()

    Rebuilds the occurrence scores of a corpus: gets (or creates) its
    'Occurrences' node, deletes the NodeNodeNgram rows attached to that
    node, then inserts one row per ngram holding the sum of the
    NodeNgram weights over the corpus' child nodes of type 'Document'.

    corpus -- corpus node; only corpus.id is read here
    debug  -- when True, re-reads the freshly written rows and prints them
    '''
    # The original body used `session` and `mysession` without ever binding
    # them; the session is now taken from the imported get_session().
    # NOTE(review): assumes get_session() takes no argument and returns a
    # SQLAlchemy session -- confirm against gargantext_web.db.
    session = get_session()

    occs_node = get_or_create_node(nodetype='Occurrences',
                                   corpus=corpus,
                                   mysession=session)

    # Start from a clean slate: drop the scores previously stored on the node.
    (session.query(NodeNodeNgram)
            .filter(NodeNodeNgram.nodex_id == occs_node.id)
            .delete()
    )
    session.commit()

    db, cursor = get_cursor()

    # Table names and integer ids come from the models, not from user input,
    # so they are interpolated directly into the SQL template.
    cursor.execute('''
        INSERT INTO
            %s (nodex_id, nodey_id, ngram_id, score)
        SELECT
            %d AS nodex_id,
            %d AS nodey_id,
            nodengram.ngram_id AS ngram_id,
            SUM(nodengram.weight) AS score
        FROM
            %s AS nodengram
        INNER JOIN
            %s AS node ON nodengram.node_id = node.id
        WHERE
            node.parent_id = %d
        AND
            node.type_id = %d
        GROUP BY
            nodengram.ngram_id
    ''' % (NodeNodeNgram.__table__.name,
           occs_node.id,
           corpus.id,
           NodeNgram.__table__.name,
           Node.__table__.name,
           corpus.id,
           cache.NodeType['Document'].id,
           )
    )
    db.commit()

    if debug is True:
        data = (session.query(NodeNodeNgram)
                       .filter(NodeNodeNgram.nodex_id == occs_node.id)
                       .all()
        )
        print(list(data))
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment