Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
dee88be8
Commit
dee88be8
authored
Mar 04, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
first simple version of tfidf in ngram_scores
parent
a65df75a
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
159 additions
and
23 deletions
+159
-23
constants.py
gargantext/constants.py
+16
-15
lists.py
gargantext/util/lists.py
+17
-1
__init__.py
gargantext/util/toolchain/__init__.py
+16
-7
ngram_groups.py
gargantext/util/toolchain/ngram_groups.py
+0
-0
ngram_scores.py
gargantext/util/toolchain/ngram_scores.py
+110
-0
No files found.
gargantext/constants.py
View file @
dee88be8
...
...
@@ -9,29 +9,30 @@ LISTTYPES = {
'STOPLIST'
:
UnweightedList
,
'MAINLIST'
:
UnweightedList
,
'MAPLIST'
:
UnweightedList
,
'OCCURRENCES'
:
Weighted
List
,
'OCCURRENCES'
:
Weighted
ContextIndex
,
'COOCCURRENCES'
:
WeightedMatrix
,
'TFIDF-CORPUS'
:
WeightedContextIndex
,
}
NODETYPES
=
[
None
,
# documents hierarchy
'USER'
,
'PROJECT'
,
'CORPUS'
,
'DOCUMENT'
,
'USER'
,
# 1
'PROJECT'
,
# 2
'CORPUS'
,
# 3
'DOCUMENT'
,
# 4
# lists
'STOPLIST'
,
'GROUPLIST'
,
'MAINLIST'
,
'MAPLIST'
,
'COOCCURRENCES'
,
'STOPLIST'
,
# 5
'GROUPLIST'
,
# 6
'MAINLIST'
,
# 7
'MAPLIST'
,
# 8
'COOCCURRENCES'
,
# 9
# scores
'OCCURRENCES'
,
'SPECIFICITY'
,
'CVALUE'
,
'TFIDF-CORPUS'
,
'TFIDF-GLOBAL'
,
'OCCURRENCES'
,
# 10
'SPECIFICITY'
,
# 11
'CVALUE'
,
# 12
'TFIDF-CORPUS'
,
# 13
'TFIDF-GLOBAL'
,
# 14
]
...
...
gargantext/util/lists.py
View file @
dee88be8
...
...
@@ -2,7 +2,7 @@
"""
__all__
=
[
'Translations'
,
'WeightedMatrix'
,
'UnweightedList'
,
'WeightedList'
]
__all__
=
[
'Translations'
,
'WeightedMatrix'
,
'UnweightedList'
,
'WeightedList'
,
'WeightedContextIndex'
]
from
gargantext.util.db
import
session
,
bulk_insert
...
...
@@ -165,6 +165,22 @@ class Translations(_BaseClass):
)
class WeightedContextIndex(_BaseClass):
    """
    Weighted ngram index scoped by a (node, context-node) pair.

    associated model   : NodeNodeNgram
    associated columns : node1_id | node2_id | ngram_id | score (float)

    Tensor representing a contextual index or registry
    (matrix of weighted ngrams *per* doc *per* context)

    Example : tfidf by corpus
    """
    def __init__(self, source=None):
        # NOTE(review): `source` is accepted but currently ignored —
        # presumably a placeholder for loading from a DB node, like the
        # sibling list classes; confirm before relying on it.
        # items maps ngram_id -> weight, defaulting to 0.0
        self.items = defaultdict(float)
class
WeightedMatrix
(
_BaseClass
):
def
__init__
(
self
,
source
=
None
):
...
...
gargantext/util/toolchain/__init__.py
View file @
dee88be8
from
.parsing
import
parse
from
.parsing
import
parse
from
.ngrams_extraction
import
extract_ngrams
from
.ngram_scores
import
compute_occurrences_local
,
compute_tfidf_local
from
.ngram_groups
import
compute_groups
from
gargantext.util.db
import
session
from
gargantext.models
import
Node
from
.group
import
compute_groups
from
gargantext.models
import
Node
def
parse_extract
(
corpus
):
# retrieve corpus from database from id
...
...
@@ -23,5 +22,15 @@ def parse_extract(corpus):
print
(
'CORPUS #
%
d: extracted ngrams'
%
(
corpus
.
id
))
# temporary ngram lists workflow
group_id
=
compute_groups
(
corpus
)
print
(
'CORPUS #
%
d: new grouplist = #
%
i'
%
(
corpus
.
id
,
group_id
))
# write occurrences to Node and NodeNodeNgram
occnd_id
=
compute_occurrences_local
(
corpus
)
print
(
'CORPUS #
%
d: new occs node #
%
i'
%
(
corpus
.
id
,
occnd_id
))
# write local tfidf to Node and NodeNodeNgram
ltfidf_id
=
compute_tfidf_local
(
corpus
)
print
(
'CORPUS #
%
d: new localtfidf node #
%
i'
%
(
corpus
.
id
,
ltfidf_id
))
# write groups to Node and NodeNgramNgram
group_id
=
compute_groups
(
corpus
,
stoplist_id
=
None
)
print
(
'CORPUS #
%
d: new grouplist node #
%
i'
%
(
corpus
.
id
,
group_id
))
gargantext/util/toolchain/
group
.py
→
gargantext/util/toolchain/
ngram_groups
.py
View file @
dee88be8
File moved
gargantext/util/toolchain/ngram_scores.py
0 → 100644
View file @
dee88be8
from
gargantext.models
import
Node
,
NodeNgram
,
NodeNodeNgram
from
gargantext.util.db
import
session
,
bulk_insert
# £TODO
# from gargantext.util.lists import WeightedContextIndex
from
gargantext.util.db
import
func
# = sqlalchemy.func like sum() or count()
from
math
import
log
def compute_occurrences_local(corpus):
    """
    Calculates sum of occs per ngram within corpus

    Persists the result: creates a new "OCCURRENCES" Node (child of the
    corpus, same owner) and writes one NodeNodeNgram row per ngram
    (node1_id = new occurrences node, node2_id = corpus, score = summed
    weight).

    Returns the id of the newly created OCCURRENCES node.
    """
    # 1) all the doc_ids of our corpus (scope of counts for filter)
    # slower alternative: [doc.id for doc in corpus.children('DOCUMENT').all()]
    docids_subquery = (session
                        .query(Node.id)
                        .filter(Node.parent_id == corpus.id)
                        .filter(Node.typename == "DOCUMENT")
                        .subquery()
                       )

    # 2) our sums per ngram_id, restricted to this corpus's documents
    occ_sums = (session
                 .query(NodeNgram.ngram_id, func.sum(NodeNgram.weight))
                 .filter(NodeNgram.node_id.in_(docids_subquery))
                 .group_by(NodeNgram.ngram_id)
                 .all()
                )
    # example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
    #                     ^^^^  ^^^
    #                 ngram_id  sum_wei

    # create the new OCCURRENCES node
    occnode = Node()
    occnode.typename = "OCCURRENCES"
    occnode.name = "occ_sums (in: %s)" % corpus.id
    occnode.parent_id = corpus.id
    occnode.user_id = corpus.user_id
    session.add(occnode)
    # commit first so occnode.id is assigned before the bulk insert below
    session.commit()

    # reflect that in NodeNodeNgrams (could be NodeNgram but harmony with tfidf)
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((occnode.id, corpus.id, res[0], res[1]) for res in occ_sums)
    )

    return occnode.id
def compute_tfidf_local(corpus):
    """
    Calculates tfidf within the current corpus

    Persists the result: creates a new "TFIDF-CORPUS" Node (child of the
    corpus, same owner) and writes one NodeNodeNgram row per ngram
    (node1_id = new tfidf node, node2_id = corpus, score = tfidf).

    Returns the id of the newly created TFIDF-CORPUS node.
    """
    # ?? FIXME could we keep the docids somehow from previous computations ??
    docids_subquery = (session
                        .query(Node.id)
                        .filter(Node.parent_id == corpus.id)
                        .filter(Node.typename == "DOCUMENT")
                        .subquery()
                       )

    # total number of documents in this corpus (the "N" of the idf term)
    total_docs = session.query(docids_subquery).count()

    # or perhaps at least do the occurrences right now at the same time
    tf_nd = (session
              .query(
                  NodeNgram.ngram_id,
                  func.sum(NodeNgram.weight),     # tf: same as occnode
                  func.count(NodeNgram.node_id)   # nd: n docs with term
               )
              .filter(NodeNgram.node_id.in_(docids_subquery))
              .group_by(NodeNgram.ngram_id)
              .all()
             )

    # ---------------------------------------------
    tfidfs = {}
    for (ngram_id, tf, nd) in tf_nd:
        # FIX: when a term occurs in *every* document (nd == total_docs),
        # log(total_docs / nd) == log(1) == 0 and the original expression
        # raised ZeroDivisionError. Such a ubiquitous term has no
        # discriminating power, so we give it a score of 0.0 instead.
        idf = log(total_docs / nd)
        tfidfs[ngram_id] = tf / idf if idf != 0 else 0.0
    # ---------------------------------------------

    # create the new TFIDF-CORPUS node
    ltfidf = Node()
    ltfidf.typename = "TFIDF-CORPUS"
    ltfidf.name = "tfidf (in: %s)" % corpus.id
    ltfidf.parent_id = corpus.id
    ltfidf.user_id = corpus.user_id
    session.add(ltfidf)
    # commit first so ltfidf.id is assigned before the bulk insert below
    session.commit()

    # reflect that in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedContextMatrix.save()
    bulk_insert(
        NodeNodeNgram,
        ('node1_id', 'node2_id', 'ngram_id', 'score'),
        ((ltfidf.id, corpus.id, ng, tfidfs[ng]) for ng in tfidfs)
    )

    return ltfidf.id
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment