Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
7a141a02
Commit 7a141a02, authored May 09, 2016 by delanoe
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[NGRAMS] workflow fixes.
parent
91e14e3e
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing 2 changed files with 12 additions and 4 deletions
+12
-4
list_map.py
gargantext/util/toolchain/list_map.py
+3
-3
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+9
-1
No files found.
gargantext/util/toolchain/list_map.py
View file @
7a141a02
...
@@ -52,7 +52,7 @@ def do_maplist(corpus,
...
@@ -52,7 +52,7 @@ def do_maplist(corpus,
primary_groupterms_subquery
=
(
session
primary_groupterms_subquery
=
(
session
# we want only primary terms (ngram1)
# we want only primary terms (ngram1)
.
query
(
NodeNgramNgram
.
ngram
1
_id
)
.
query
(
NodeNgramNgram
.
ngram
2
_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
grouplist_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
grouplist_id
)
.
subquery
()
.
subquery
()
)
)
...
@@ -64,7 +64,7 @@ def do_maplist(corpus,
...
@@ -64,7 +64,7 @@ def do_maplist(corpus,
.
join
(
Ngram
,
Ngram
.
id
==
ScoreSpec
.
ngram_id
)
.
join
(
Ngram
,
Ngram
.
id
==
ScoreSpec
.
ngram_id
)
.
filter
(
ScoreSpec
.
node_id
==
specificity_id
)
.
filter
(
ScoreSpec
.
node_id
==
specificity_id
)
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
mainterms_subquery
))
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
mainterms_subquery
))
.
filter
(
ScoreSpec
.
ngram_id
.
in_
(
primary_groupterms_subquery
))
.
filter
(
ScoreSpec
.
ngram_id
.
not
in_
(
primary_groupterms_subquery
))
)
)
# TODO: move these 2 pools up to mainlist selection
# TODO: move these 2 pools up to mainlist selection
...
@@ -81,7 +81,7 @@ def do_maplist(corpus,
...
@@ -81,7 +81,7 @@ def do_maplist(corpus,
.
limit
(
multigrams_limit
)
.
limit
(
multigrams_limit
)
.
all
()
.
all
()
)
)
obtained_mono
=
len
(
top_monograms
)
obtained_mono
=
len
(
top_monograms
)
obtained_multi
=
len
(
top_multigrams
)
obtained_multi
=
len
(
top_multigrams
)
obtained_total
=
obtained_mono
+
obtained_multi
obtained_total
=
obtained_mono
+
obtained_multi
# print("MAPLIST: top_monograms =", obtained_mono)
# print("MAPLIST: top_monograms =", obtained_mono)
...
...
gargantext/util/toolchain/metric_tfidf.py
View file @
7a141a02
...
@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata
...
@@ -8,7 +8,7 @@ FIXME: "having the same source" means we need to select inside hyperdata
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
with a (perhaps costly) JSON query: WHERE hyperdata->'resources' @> ...
"""
"""
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
,
NodeNgramNgram
from
gargantext.util.db
import
session
,
bulk_insert
,
func
# = sqlalchemy.func like sum() or count()
from
gargantext.util.db
import
session
,
bulk_insert
,
func
# = sqlalchemy.func like sum() or count()
from
sqlalchemy
import
text
# for query from raw SQL statement
from
sqlalchemy
import
text
# for query from raw SQL statement
from
math
import
log
from
math
import
log
...
@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None):
...
@@ -29,6 +29,13 @@ def compute_occs(corpus, overwrite_id = None):
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
"""
# 0) Get the groups
group_id
=
(
session
.
query
(
Node
.
id
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
filter
(
Node
.
typename
==
"GROUPLIST"
)
.
first
()
)
# 1) all the doc_ids of our corpus (scope of counts for filter)
# 1) all the doc_ids of our corpus (scope of counts for filter)
# slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
# slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
...
@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None):
...
@@ -45,6 +52,7 @@ def compute_occs(corpus, overwrite_id = None):
NodeNgram
.
ngram_id
,
NodeNgram
.
ngram_id
,
func
.
sum
(
NodeNgram
.
weight
)
func
.
sum
(
NodeNgram
.
weight
)
)
)
#.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
.
filter
(
NodeNgram
.
node_id
.
in_
(
docids_subquery
))
.
filter
(
NodeNgram
.
node_id
.
in_
(
docids_subquery
))
.
group_by
(
NodeNgram
.
ngram_id
)
.
group_by
(
NodeNgram
.
ngram_id
)
.
all
()
.
all
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please register or sign in to comment