Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
1bb37aff
Commit
1bb37aff
authored
May 18, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
occurrences with subforms counted inside mainform
parent
f280e8a4
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
79 additions
and
31 deletions
+79
-31
metric_tfidf.py
gargantext/util/toolchain/metric_tfidf.py
+79
-31
No files found.
gargantext/util/toolchain/metric_tfidf.py
View file @
1bb37aff
...
@@ -11,57 +11,105 @@ FIXME: "having the same source" means we need to select inside hyperdata
...
@@ -11,57 +11,105 @@ FIXME: "having the same source" means we need to select inside hyperdata
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
,
NodeNgramNgram
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
,
NodeNgramNgram
from
gargantext.util.db
import
session
,
bulk_insert
,
func
# = sqlalchemy.func like sum() or count()
from
gargantext.util.db
import
session
,
bulk_insert
,
func
# = sqlalchemy.func like sum() or count()
from
sqlalchemy
import
text
# for query from raw SQL statement
from
sqlalchemy
import
text
# for query from raw SQL statement
from
sqlalchemy.sql.expression
import
case
# for choice if ngram has mainform or not
from
math
import
log
from
math
import
log
# £TODO
# £TODO
# from gargantext.util.lists import WeightedContextIndex
# from gargantext.util.lists import WeightedContextIndex
def
compute_occs
(
corpus
,
overwrite_id
=
None
):
def
compute_occs
(
corpus
,
overwrite_id
=
None
,
groupings_id
=
None
,
):
"""
"""
# TODO check if cumulated occs correspond to app's use cases and intention
Calculates sum of occs per ngram (or per mainform if groups) within corpus
Calculates sum of occs per ngram within corpus
(used as info in the ngrams table view)
(used as info in the ngrams table view)
? optimize ? OCCS here could be calculated simultaneously within TFIDF-CORPUS loop
? optimize ? OCCS here could be calculated simultaneously within TFIDF-CORPUS loop
? use cases ?
=> not the main score for users (their intuition for nb of docs having word)
=> but is the main weighting value for any NLP task
Parameters:
Parameters:
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
- overwrite_id: optional id of a pre-existing OCCURRENCES node for this corpus
(the Node and its previous NodeNodeNgram rows will be replaced)
(the Node and its previous NodeNodeNgram rows will be replaced)
- groupings_id: optional id of a GROUPLIST node for this corpus
IF absent the occurrences are the sums for each ngram
IF present they're the sums for each ngram's mainform
"""
"""
# 0) Get the groups
# simple case : no groups
group_id
=
(
session
.
query
(
Node
.
id
)
# ---------
# (the occurrences are the sums for each ngram)
if
not
groupings_id
:
# NodeNgram index
occs_q
=
(
session
.
query
(
NodeNgram
.
ngram_id
,
func
.
sum
(
NodeNgram
.
weight
)
# <== OCCURRENCES
)
# filter docs within corpus
.
join
(
Node
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
filter
(
Node
.
typename
==
"GROUPLIST"
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
.
first
()
# for the sum
.
group_by
(
NodeNgram
.
ngram_id
)
)
)
# 1) all the doc_ids of our corpus (scope of counts for filter)
# difficult case: with groups
# slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
# ------------
docids_subquery
=
(
session
# (the occurrences are the sums for each ngram's mainform)
.
query
(
Node
.
id
)
else
:
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
# sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
syn
=
(
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
groupings_id
)
.
subquery
()
.
subquery
()
)
)
# 2) our sums per ngram_id
# NodeNgram index with additional subform => mainform replacement
occ_sums
=
(
session
occs_q
=
(
session
.
query
(
.
query
(
NodeNgram
.
ngram_id
,
# intermediate columns for debug
func
.
sum
(
NodeNgram
.
weight
)
# -------------------------------
# NodeNgram.node_id, # document
# NodeNgram.ngram_id, # <= the occurring ngram
# NodeNgram.weight, # <= its frequency in doc
# syn.c.ngram1_id # mainform
# syn.c.ngram2_id, # subform
# ngram to count aka counted_form
# ----------------------------------
# either NodeNgram.ngram_id as before
# or mainform if it exists
case
([(
syn
.
c
.
ngram1_id
!=
None
,
syn
.
c
.
ngram1_id
)],
else_
=
NodeNgram
.
ngram_id
)
.
label
(
"counted_form"
),
# the sum itself
# --------------
func
.
sum
(
NodeNgram
.
weight
)
# <== OCCURRENCES
)
)
#.join(NodeNgramNgram, NodeNgramNgram.node_id == group_id)
# this brings the mainform if NodeNgram.ngram_id has one in syn
.
filter
(
NodeNgram
.
node_id
.
in_
(
docids_subquery
))
.
outerjoin
(
syn
,
.
group_by
(
NodeNgram
.
ngram_id
)
syn
.
c
.
ngram2_id
==
NodeNgram
.
ngram_id
)
.
all
()
# filter docs within corpus
.
join
(
Node
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
)
.
filter
(
Node
.
typename
==
"DOCUMENT"
)
# for the sum
.
group_by
(
"counted_form"
)
)
)
occ_sums
=
occs_q
.
all
()
# example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
# example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
# ^^^^ ^^^
# ^^^^ ^^^
# ngram_id sum_wei
# ngram_id sum_wei
# OR
# counted_form
if
overwrite_id
:
if
overwrite_id
:
# overwrite pre-existing id
# overwrite pre-existing id
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment