Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
b548874f
Commit
b548874f
authored
Jul 05, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
maplist creation from spec/gen metrics
parent
d9b1cf7b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
107 additions
and
33 deletions
+107
-33
list_map.py
gargantext/util/toolchain/list_map.py
+107
-33
No files found.
gargantext/util/toolchain/list_map.py
View file @
b548874f
...
...
@@ -27,23 +27,31 @@ def do_maplist(corpus,
Parameters:
- mainlist_id (starting point, already cleaned of stoplist terms)
- specclusion_id (inclusion by cooc specificity -- ranking factor)
- genclusion_id (inclusion by cooc genericity -- ranking factor)
- specclusion_id (
ngram
inclusion by cooc specificity -- ranking factor)
- genclusion_id (
ngram
inclusion by cooc genericity -- ranking factor)
- grouplist_id (filtering grouped ones)
- overwrite_id: optional if preexisting MAPLIST node to overwrite
+
2 constant
s to modulate the terms choice
+
3 param
s to modulate the terms choice
- limit for the amount of picked terms
- monograms_part: a ratio of terms with only one lexical unit to keep
(multigrams quota = limit * (1-monograms_part))
- genclusion_part: a ratio of terms picked by genericity ranking to keep
(speclusion quota = limit * (1-genclusion_part))
'''
if
not
(
mainlist_id
and
specclusion_id
and
genclusion_id
and
grouplist_id
):
raise
ValueError
(
"Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id"
)
monograms_limit
=
round
(
limit
*
monograms_part
)
multigrams_limit
=
limit
-
monograms_limit
print
(
"MAPLIST: monograms_limit ="
,
monograms_limit
)
print
(
"MAPLIST: multigrams_limit = "
,
multigrams_limit
)
quotas
=
{
'topgen'
:{},
'topspec'
:{}}
genclusion_limit
=
round
(
limit
*
genclusion_part
)
speclusion_limit
=
limit
-
genclusion_limit
quotas
[
'topgen'
][
'monograms'
]
=
round
(
genclusion_limit
*
monograms_part
)
quotas
[
'topgen'
][
'multigrams'
]
=
genclusion_limit
-
quotas
[
'topgen'
][
'monograms'
]
quotas
[
'topspec'
][
'monograms'
]
=
round
(
speclusion_limit
*
monograms_part
)
quotas
[
'topspec'
][
'multigrams'
]
=
speclusion_limit
-
quotas
[
'topspec'
][
'monograms'
]
print
(
"MAPLIST quotas:"
,
quotas
)
#dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
...
...
@@ -58,11 +66,19 @@ def do_maplist(corpus,
)
ScoreSpec
=
aliased
(
NodeNgram
)
# specificity-ranked
query
=
(
session
.
query
(
ScoreSpec
.
ngram_id
)
ScoreGen
=
aliased
(
NodeNgram
)
# ngram with both ranking factors spec and gen
query
=
(
session
.
query
(
ScoreSpec
.
ngram_id
,
ScoreSpec
.
weight
,
ScoreGen
.
weight
,
Ngram
.
n
)
.
join
(
Ngram
,
Ngram
.
id
==
ScoreSpec
.
ngram_id
)
.
join
(
ScoreGen
,
ScoreGen
.
ngram_id
==
ScoreSpec
.
ngram_id
)
.
filter
(
ScoreSpec
.
node_id
==
specclusion_id
)
.
filter
(
ScoreGen
.
node_id
==
genclusion_id
)
# we want only terms within mainlist
.
join
(
MainlistTable
,
Ngram
.
id
==
MainlistTable
.
ngram_id
)
...
...
@@ -72,36 +88,96 @@ def do_maplist(corpus,
.
outerjoin
(
IsSubform
,
IsSubform
.
c
.
ngram2_id
==
ScoreSpec
.
ngram_id
)
.
filter
(
IsSubform
.
c
.
ngram2_id
==
None
)
)
# TODO: move these 2 pools up to mainlist selection
top_monograms
=
(
query
.
filter
(
Ngram
.
n
==
1
)
# specificity-ranked
.
order_by
(
desc
(
ScoreSpec
.
weight
))
.
limit
(
monograms_limit
)
.
all
()
)
)
top_multigrams
=
(
query
.
filter
(
Ngram
.
n
>=
2
)
.
order_by
(
desc
(
ScoreSpec
.
weight
))
.
limit
(
multigrams_limit
)
.
all
()
)
obtained_mono
=
len
(
top_monograms
)
obtained_multi
=
len
(
top_multigrams
)
obtained_total
=
obtained_mono
+
obtained_multi
# print("MAPLIST: top_monograms =", obtained_mono)
# print("MAPLIST: top_multigrams = ", obtained_multi)
# format in scored_ngrams array:
# -------------------------------
# [(37723, 8.428, 14.239, 3 ), etc]
# ngramid wspec wgen nwords
scored_ngrams
=
query
.
all
()
n_ngrams
=
len
(
scored_ngrams
)
# results, with same structure as quotas
chosen_ngrams
=
{
'topgen'
:{
'monograms'
:[],
'multigrams'
:[]},
'topspec'
:{
'monograms'
:[],
'multigrams'
:[]}
}
# specificity and genericity are rather inversely correlated
# but occasionally they can have common ngrams (same ngram well ranked in both)
# => we'll use a lookup table to check if we didn't already get it
already_gotten_ngramids
=
{}
# 2 loops to fill spec-clusion then gen-clusion quotas
# (1st pass uses the order returned by the DB query, 2nd pass uses our own sort by genericity weight done at the end of the 1st)
for
rkr
in
[
'topspec'
,
'topgen'
]:
got_enough_mono
=
False
got_enough_multi
=
False
all_done
=
False
i
=
-
1
while
((
not
all_done
)
and
(
not
(
got_enough_mono
and
got_enough_multi
))):
# retrieve sorted ngram n° i
i
+=
1
(
ng_id
,
wspec
,
wgen
,
nwords
)
=
scored_ngrams
[
i
]
# before any `continue` case, check whether the next i would go past the end of scored_ngrams
all_done
=
(
i
+
1
>=
n_ngrams
)
if
ng_id
in
already_gotten_ngramids
:
continue
# NB: nwords could be replaced by a simple search on r' '
if
nwords
==
1
:
if
got_enough_mono
:
continue
else
:
# add ngram to results and lookup
chosen_ngrams
[
rkr
][
'monograms'
]
.
append
(
ng_id
)
already_gotten_ngramids
[
ng_id
]
=
True
# multi
else
:
if
got_enough_multi
:
continue
else
:
# add ngram to results and lookup
chosen_ngrams
[
rkr
][
'multigrams'
]
.
append
(
ng_id
)
already_gotten_ngramids
[
ng_id
]
=
True
got_enough_mono
=
(
len
(
chosen_ngrams
[
rkr
][
'monograms'
])
>=
quotas
[
rkr
][
'monograms'
])
got_enough_multi
=
(
len
(
chosen_ngrams
[
rkr
][
'multigrams'
])
>=
quotas
[
rkr
][
'multigrams'
])
# at the end of the first loop we just need to sort all by the second ranker (gen)
scored_ngrams
=
sorted
(
scored_ngrams
,
key
=
lambda
ng_infos
:
ng_infos
[
2
],
reverse
=
True
)
obtained_spec_mono
=
len
(
chosen_ngrams
[
'topspec'
][
'monograms'
])
obtained_spec_multi
=
len
(
chosen_ngrams
[
'topspec'
][
'multigrams'
])
obtained_gen_mono
=
len
(
chosen_ngrams
[
'topgen'
][
'monograms'
])
obtained_gen_multi
=
len
(
chosen_ngrams
[
'topgen'
][
'multigrams'
])
obtained_total
=
obtained_spec_mono
\
+
obtained_spec_multi
\
+
obtained_gen_mono
\
+
obtained_gen_multi
print
(
"MAPLIST: top_spec_monograms ="
,
obtained_spec_mono
)
print
(
"MAPLIST: top_spec_multigrams ="
,
obtained_spec_multi
)
print
(
"MAPLIST: top_gen_monograms ="
,
obtained_gen_mono
)
print
(
"MAPLIST: top_gen_multigrams ="
,
obtained_gen_multi
)
print
(
"MAPLIST: kept
%
i ngrams in total "
%
obtained_total
)
obtained_data
=
chosen_ngrams
[
'topspec'
][
'monograms'
]
\
+
chosen_ngrams
[
'topspec'
][
'multigrams'
]
\
+
chosen_ngrams
[
'topgen'
][
'monograms'
]
\
+
chosen_ngrams
[
'topgen'
][
'multigrams'
]
# NEW MAPLIST NODE
# -----------------
# saving the parameters of the analysis in the Node JSON
new_hyperdata
=
{
'corpus'
:
corpus
.
id
,
'limit'
:
limit
,
'monograms_part'
:
monograms_part
,
'monograms_result'
:
obtained_mono
/
obtained_total
if
obtained_total
!=
0
else
0
'monograms_part'
:
monograms_part
,
'genclusion_part'
:
genclusion_part
,
}
if
overwrite_id
:
# overwrite pre-existing node
...
...
@@ -122,9 +198,7 @@ def do_maplist(corpus,
the_id
=
the_maplist
.
id
# create UnweightedList object and save (=> new NodeNgram rows)
datalist
=
UnweightedList
(
[
res
.
ngram_id
for
res
in
top_monograms
+
top_multigrams
]
)
datalist
=
UnweightedList
(
obtained_data
)
# save
datalist
.
save
(
the_id
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment