Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
744ec7f1
Commit
744ec7f1
authored
Mar 11, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
mainlist creation
parent
89c8268c
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
100 additions
and
7 deletions
+100
-7
constants.py
gargantext/constants.py
+3
-1
__init__.py
gargantext/util/toolchain/__init__.py
+15
-6
list_main.py
gargantext/util/toolchain/list_main.py
+82
-0
No files found.
gargantext/constants.py
View file @
744ec7f1
...
...
@@ -93,7 +93,9 @@ RESOURCETYPES = [
]
# linguistic extraction parameters
DEFAULT_COOC_THRESHOLD
=
4
DEFAULT_TFIDF_CUTOFF_RATIO
=
.55
# for MAINLIST maximum terms
DEFAULT_TFIDF_HARD_LIMIT
=
1000
# for MAINLIST maximum terms
DEFAULT_COOC_THRESHOLD
=
4
# for COOCCURRENCES node
# other parameters
# default number of docs POSTed to scrappers.views.py
...
...
gargantext/util/toolchain/__init__.py
View file @
744ec7f1
from
.parsing
import
parse
from
.ngrams_extraction
import
extract_ngrams
from
.list_stop
import
compute_stop
from
.list_stop
import
do_stoplist
from
.ngram_scores
import
compute_occurrences_local
,
compute_tfidf
from
.list_main
import
do_mainlist
from
.ngram_coocs_tempo
import
compute_coocs
from
.score_specificity
import
compute_specificity
from
.list_map
import
compute_mapList
# TEST
...
...
@@ -24,6 +25,12 @@ def parse_extract(corpus):
# apply actions
print
(
'CORPUS #
%
d'
%
(
corpus
.
id
))
parse
(
corpus
)
# was there an error in the process ?
if
corpus
.
status
()[
'error'
]:
print
(
"ERROR: aborting parse_extract for corpus #
%
i"
%
corpus_id
)
return
None
print
(
'CORPUS #
%
d: parsed'
%
(
corpus
.
id
))
extract_ngrams
(
corpus
)
print
(
'CORPUS #
%
d: extracted ngrams'
%
(
corpus
.
id
))
...
...
@@ -45,16 +52,16 @@ def parse_extract(corpus):
gtfidf_id
=
compute_tfidf
(
corpus
,
scope
=
"global"
)
print
(
'CORPUS #
%
d: [
%
s] new globaltfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
gtfidf_id
))
#
??
mainlist: compute + write (to Node and NodeNgram)
# mainlist_id = compute_mainlist(corpus
)
#
print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(), mainlist_id))
#
->
mainlist: compute + write (to Node and NodeNgram)
mainlist_id
=
mainlist_filter
(
corpus
,
tfidf_id
=
gtfidf_id
,
stoplist_id
=
stop_id
)
print
(
'CORPUS #
%
d: [
%
s] new mainlist node #
%
i'
%
(
corpus
.
id
,
t
(),
mainlist_id
))
# -> cooccurrences: compute + write (=> Node and NodeNodeNgram)
cooc_id
=
compute_coocs
(
corpus
,
stop_id
=
None
)
cooc_id
=
compute_coocs
(
corpus
,
mainlist_id
=
mainlist_id
,
stop_id
=
None
)
print
(
'CORPUS #
%
d: [
%
s] new cooccs node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
# ?? specificity: compute + write (=> NodeNodeNgram)
spec_id
=
compute_specificity
(
co
oc_id
=
cooc_id
,
corpus
=
corpus
)
spec_id
=
compute_specificity
(
co
rpus
,
cooc_id
=
cooc_id
)
print
(
'CORPUS #
%
d: [
%
s] new specificity node #
%
i'
%
(
corpus
.
id
,
t
(),
cooc_id
))
# ?? maplist: compute + write (to Node and NodeNgram)
...
...
@@ -70,5 +77,7 @@ def parse_extract(corpus):
print
(
'CORPUS #
%
d: [
%
s] new grouplist node #
%
i'
%
(
corpus
.
id
,
t
(),
group_id
))
def t():
    """Return the current local time formatted as 'YYYY-mm-dd_HH:MM:SS' (used to stamp log lines)."""
    now = datetime.now()
    return now.strftime("%Y-%m-%d_%H:%M:%S")
gargantext/util/toolchain/list_main.py
0 → 100644
View file @
744ec7f1
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
from
gargantext.util.db
import
session
from
gargantext.util.lists
import
UnweightedList
from
sqlalchemy
import
desc
from
gargantext.constants
import
DEFAULT_TFIDF_CUTOFF_RATIO
,
DEFAULT_TFIDF_HARD_LIMIT
from
math
import
floor
def do_mainlist(corpus,
                tfidf_id=None, stoplist_id=None,
                hard_limit=DEFAULT_TFIDF_HARD_LIMIT,
                ratio_limit=DEFAULT_TFIDF_CUTOFF_RATIO):
    """
    Select terms for the mainlist according to a global tfidf and stoplist.

    The number of selected terms will be:
        min(hard_limit, number_of_terms * ratio_limit)

    NB : We use a global tfidf node where the values are global but the ngrams
    are already selected (== only within this corpus documents).

    Parameters:
        corpus       -- the corpus Node whose mainlist we create
        tfidf_id     -- id of a TFIDF-GLOBAL node (looked up under corpus if None)
        stoplist_id  -- id of a STOPLIST node (looked up under corpus if None)

        2 limits are useful to set a maximum amount of picked terms
          - ratio_limit: relative to the number of distinct ngrams [0,1]
          - hard_limit: absolute value [default: 1000]

    Returns the id of the newly created MAINLIST node.
    Raises ValueError if no TFIDF-GLOBAL or STOPLIST node can be found.
    """
    # retrieve helper nodes if not provided
    # NOTE: .scalar() (not .first()) so we get the bare id (or None), not a
    #       one-element row tuple — the raw row would break the column
    #       comparisons below (node1_id == tfidf_id, node_id == stoplist_id)
    if not tfidf_id:
        tfidf_id = session.query(Node.id).filter(
            Node.typename == "TFIDF-GLOBAL",
            Node.parent_id == corpus.id
        ).scalar()

        if not tfidf_id:
            raise ValueError("MAINLIST: TFIDF node needed for mainlist creation")

    if not stoplist_id:
        stoplist_id = session.query(Node.id).filter(
            Node.typename == "STOPLIST",
            Node.parent_id == corpus.id
        ).scalar()

        if not stoplist_id:
            raise ValueError("MAINLIST: STOPLIST node needed for mainlist creation")

    # the ngrams we don't want
    # NOTE: keep sure we do this only once during the ngram initial workflow
    stopterms_subquery = (session
                            .query(NodeNgram.ngram_id)
                            .filter(NodeNgram.node_id == stoplist_id)
                            .subquery()
                         )

    # tfidf-ranked query: best-scored ngrams of the global tfidf node,
    # minus anything present in the stoplist
    ordered_filtered_tfidf = (session
                                .query(NodeNodeNgram.ngram_id)
                                .filter(NodeNodeNgram.node1_id == tfidf_id)
                                .filter(~ NodeNodeNgram.ngram_id.in_(stopterms_subquery))
                                .order_by(desc(NodeNodeNgram.score))
                             )

    # total count
    nb_ngrams = ordered_filtered_tfidf.count()

    # apply ratio to find smallest limit
    our_limit = min(hard_limit, floor(nb_ngrams * ratio_limit))

    # DB retrieve up to limit => MAINLIST
    top_ngrams_ids = ordered_filtered_tfidf.limit(our_limit).all()

    # now create the new MAINLIST node
    mainlist = corpus.add_child(
        typename="MAINLIST",
        name="Mainlist (in:%s)" % corpus.name[0:10]
    )
    session.add(mainlist)
    session.commit()
    the_id = mainlist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    UnweightedList(top_ngrams_ids).save(the_id)

    return the_id
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment