Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
eee27166
Commit
eee27166
authored
May 22, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
finished ngram workflow with groups (todo: 'recount' button after changing groups)
parent
95763e12
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
95 additions
and
26 deletions
+95
-26
ngram_parsing_flow.dot
doc/schemas/ngram_parsing_flow.dot
+8
-4
ngram_parsing_flow.png
doc/schemas/ngram_parsing_flow.png
+0
-0
__init__.py
gargantext/util/toolchain/__init__.py
+3
-2
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+83
-19
ngram_coocs.py
gargantext/util/toolchain/ngram_coocs.py
+1
-1
No files found.
doc/schemas/ngram_parsing_flow.dot
View file @
eee27166
...
...
@@ -6,15 +6,19 @@ digraph ngramflow {
labelloc
=
"t"
;
"extracted_ngrams"
->
"grouplist"
;
"extracted_ngrams"
->
"occs+t
fidfs
"
;
"
main_user_stoplist
"
->
"stoplist"
;
"extracted_ngrams"
->
"occs+t
i_rank
"
;
"
project stoplist (todo)
"
->
"stoplist"
;
"stoplist"
->
"mainlist"
;
"occs+t
fidfs"
->
"mainlist"
[
label
=
" TFIDF
_LIMIT"
]
;
"occs+t
i_rank"
->
"mainlist"
[
label
=
" TI_RANK
_LIMIT"
]
;
"mainlist"
->
"coocs"
[
label
=
" COOCS_THRESHOLD"
]
;
"coocs"
->
"specificity"
;
"specificity"
->
"maplist"
[
label
=
"MAPLIST_LIMIT\nMONOGRAM_PART"
]
;
"mainlist"
->
"tfidf"
;
"tfidf"
->
"explore"
[
label
=
"doc relations with all map and candidates"
]
;
"maplist"
->
"explore"
;
"grouplist"
->
"maplist"
;
"grouplist"
->
"occs+ti_rank"
;
"grouplist"
->
"coocs"
;
"grouplist"
->
"tfidf"
;
}
doc/schemas/ngram_parsing_flow.png
100755 → 100644
View replaced file @
95763e12
View file @
eee27166
52.5 KB
|
W:
|
H:
75.9 KB
|
W:
|
H:
2-up
Swipe
Onion skin
gargantext/util/toolchain/__init__.py
View file @
eee27166
...
...
@@ -129,8 +129,9 @@ def parse_extract_indexhyperdata(corpus):
print
(
'CORPUS #
%
d: [
%
s] new mainlist node #
%
i'
%
(
corpus
.
id
,
t
(),
mainlist_id
))
# -> write local tfidf similarities to Node and NodeNodeNgram
# TODO only on mainlist
ltfidf_id
=
compute_tfidf_local
(
corpus
)
ltfidf_id
=
compute_tfidf_local
(
corpus
,
on_list_id
=
mainlist_id
,
groupings_id
=
group_id
)
print
(
'CORPUS #
%
d: [
%
s] new localtfidf node #
%
i'
%
(
corpus
.
id
,
t
(),
ltfidf_id
))
# => used for doc <=> ngram association
...
...
gargantext/util/toolchain/metric_tfidf.py
View file @
eee27166
...
...
@@ -377,12 +377,18 @@ def compute_ti_ranking(corpus,
def
compute_tfidf_local
(
corpus
,
overwrite_id
=
None
):
def
compute_tfidf_local
(
corpus
,
on_list_id
=
None
,
groupings_id
=
None
,
overwrite_id
=
None
):
"""
Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus
Parameters:
- the corpus itself
- groupings_id: optional synonym relations to add all subform counts
with their mainform's counts
- on_list_id: mainlist or maplist type, to constrain the input ngrams
- overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
"""
...
...
@@ -398,36 +404,94 @@ def compute_tfidf_local(corpus, overwrite_id=None):
# N
total_docs
=
session
.
query
(
docids_subquery
)
.
count
()
# number of docs with given term (number of rows = M ngrams)
n_docswith_ng
=
(
session
.
query
(
NodeNgram
.
ngram_id
,
func
.
count
(
NodeNgram
.
node_id
)
.
label
(
"nd"
)
# nd: n docs with term
)
.
filter
(
NodeNgram
.
node_id
.
in_
(
docids_subquery
))
.
group_by
(
NodeNgram
.
ngram_id
)
.
all
()
)
# { ngram_id => log(nd) }
log_nd_lookup
=
{
row
.
ngram_id
:
log
(
row
.
nd
)
for
row
in
n_docswith_ng
}
# define the counted form
if
not
groupings_id
:
ngform_id
=
NodeNgram
.
ngram_id
else
:
Syno
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
groupings_id
)
.
subquery
()
)
ngform_id
=
case
([
(
Syno
.
c
.
ngram1_id
!=
None
,
Syno
.
c
.
ngram1_id
),
(
Syno
.
c
.
ngram1_id
==
None
,
NodeNgram
.
ngram_id
)
])
# tf for each couple (number of rows = N docs X M ngrams)
tf_doc_
ng
=
(
session
tf_doc_
query
=
(
session
.
query
(
NodeNgram
.
ngra
m_id
,
ngfor
m_id
,
NodeNgram
.
node_id
,
func
.
sum
(
NodeNgram
.
weight
)
.
label
(
"tf"
),
# tf: occurrences
)
.
filter
(
NodeNgram
.
node_id
.
in_
(
docids_subquery
))
.
group_by
(
NodeNgram
.
node_id
,
NodeNgram
.
ngram_id
)
.
all
()
# select within docs of current corpus
.
join
(
docids_subquery
,
docids_subquery
.
c
.
id
==
NodeNgram
.
node_id
)
)
if
groupings_id
:
tf_doc_query
=
(
tf_doc_query
.
outerjoin
(
Syno
,
Syno
.
c
.
ngram2_id
==
NodeNgram
.
ngram_id
)
)
# now, when we group_by, the ngram2 freqs will be added to those of ngram1
if
on_list_id
:
Miamlist
=
aliased
(
NodeNgram
)
tf_doc_query
=
(
tf_doc_query
.
join
(
Miamlist
,
Miamlist
.
ngram_id
==
ngform_id
)
.
filter
(
Miamlist
.
node_id
==
on_list_id
)
)
# execute query to do our tf sum
tf_per_doc
=
tf_doc_query
.
group_by
(
NodeNgram
.
node_id
,
ngform_id
)
.
all
()
# ex: [(128371, 9732, 1.0),
# (128383, 9740, 1.0),
# (128373, 9731, 1.0),
# (128376, 9734, 1.0),
# (128372, 9731, 1.0),
# (128383, 9733, 1.0),
# (128383, 9735, 1.0),
# (128389, 9734, 1.0),
# (8624, 9731, 1.0),
# (128382, 9740, 1.0),
# (128383, 9739, 1.0),
# (128383, 9736, 1.0),
# (128378, 9735, 1.0),
# (128375, 9733, 4.0),
# (128383, 9732, 1.0)]
# ^ ^ ^^ ^^
# ngram doc freq in this doc
# simultaneously count docs with given term (number of rows = M ngrams)
ndocswithngram
=
{}
for
triple
in
tf_per_doc
:
ng
=
triple
[
0
]
doc
=
triple
[
1
]
if
ng
in
ndocswithngram
:
ndocswithngram
[
ng
]
+=
1
else
:
ndocswithngram
[
ng
]
=
1
# print(ndocswithngram)
# store for use in formula
# { ngram_id => log(nd) }
log_nd_lookup
=
{
ng
:
log
(
nd_count
)
for
(
ng
,
nd_count
)
in
ndocswithngram
.
items
()}
# ---------------------------------------------------------
tfidfs
=
{}
log_tot_docs
=
log
(
total_docs
)
for
(
ngram_id
,
node_id
,
tf
)
in
tf_
doc_ng
:
for
(
ngram_id
,
node_id
,
tf
)
in
tf_
per_doc
:
log_nd
=
log_nd_lookup
[
ngram_id
]
# tfidfs[ngram_id] = tf * log(total_docs/nd)
tfidfs
[
node_id
,
ngram_id
]
=
tf
*
(
log_tot_docs
-
log_nd
)
...
...
gargantext/util/toolchain/ngram_coocs.py
View file @
eee27166
...
...
@@ -18,7 +18,7 @@ def compute_coocs( corpus,
stoplist_id
=
None
,
start
=
None
,
end
=
None
,
symmetry_filter
=
Tru
e
):
symmetry_filter
=
Fals
e
):
"""
Count how often some extracted terms appear
together in a small context (document)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment