Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
43ea9c0d
Commit
43ea9c0d
authored
Jul 20, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[FEAT] import/export terms table: previously unindexed ngrams are indexed at import
parent
57a097f5
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
133 additions
and
6 deletions
+133
-6
ngramlists_tools.py
gargantext/util/ngramlists_tools.py
+21
-6
ngrams_addition.py
gargantext/util/toolchain/ngrams_addition.py
+112
-0
No files found.
gargantext/util/ngramlists_tools.py
View file @
43ea9c0d
...
@@ -21,6 +21,9 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
...
@@ -21,6 +21,9 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
from
gargantext.util.toolchain.parsing
import
normalize_chars
from
gargantext.util.toolchain.parsing
import
normalize_chars
from
gargantext.util.toolchain.ngrams_extraction
import
normalize_forms
from
gargantext.util.toolchain.ngrams_extraction
import
normalize_forms
# merge will also index the new ngrams in the docs of the corpus
from
gargantext.util.toolchain.ngrams_addition
import
index_new_ngrams
from
sqlalchemy.sql
import
exists
from
sqlalchemy.sql
import
exists
from
os
import
path
from
os
import
path
from
csv
import
writer
,
reader
,
QUOTE_MINIMAL
from
csv
import
writer
,
reader
,
QUOTE_MINIMAL
...
@@ -483,7 +486,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
...
@@ -483,7 +486,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
this_row_forms
=
''
this_row_forms
=
''
# string normalizations
# string normalizations
this_row_label
=
normalize_
te
rms
(
normalize_chars
(
this_row_label
))
this_row_label
=
normalize_
fo
rms
(
normalize_chars
(
this_row_label
))
# except:
# except:
# if i == 0:
# if i == 0:
...
@@ -521,7 +524,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
...
@@ -521,7 +524,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
for
raw_term_str
in
this_row_forms
.
split
(
group_delimiter
):
for
raw_term_str
in
this_row_forms
.
split
(
group_delimiter
):
# each subform is also like an ngram declaration
# each subform is also like an ngram declaration
term_str
=
normalize_
te
rms
(
normalize_chars
(
raw_term_str
))
term_str
=
normalize_
fo
rms
(
normalize_chars
(
raw_term_str
))
imported_unique_ngramstrs
[
term_str
]
=
True
imported_unique_ngramstrs
[
term_str
]
=
True
imported_nodes_ngrams
[
this_list_type
]
.
append
(
term_str
)
imported_nodes_ngrams
[
this_list_type
]
.
append
(
term_str
)
...
@@ -559,6 +562,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
...
@@ -559,6 +562,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
# print(new_ngrams_ids)
# print(new_ngrams_ids)
# print(imported_nodes_ngrams)
# print(imported_nodes_ngrams)
# ======== Import into lists =========
# ======== Import into lists =========
# 3 x abstract lists + 1 translations
# 3 x abstract lists + 1 translations
...
@@ -632,11 +636,8 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
...
@@ -632,11 +636,8 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
seront remis dans la main à la fin)
seront remis dans la main à la fin)
NB: Uses group_tools.group_union() to merge the synonym links.
NB: Uses group_tools.group_union() to merge the synonym links.
Uses ngrams_addition.index_new_ngrams() to also add new ngrams to the docs
FIXME: new terms created at import_ngramlists() can now be added to lists
but are never added to docs
"""
"""
# log to send back to client-side (lines will be joined)
# log to send back to client-side (lines will be joined)
my_log
=
[]
my_log
=
[]
...
@@ -656,6 +657,20 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
...
@@ -656,6 +657,20 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
{
'key'
:
'map'
,
'name'
:
"MAPLIST"
}
# lid = 2
{
'key'
:
'map'
,
'name'
:
"MAPLIST"
}
# lid = 2
]
]
# ======== Index the new ngrams in the docs =========
all_possibly_new_ngram_ids
=
[]
collect
=
all_possibly_new_ngram_ids
.
append
for
lid
,
info
in
enumerate
(
linfos
):
list_type
=
info
[
'key'
]
if
list_type
in
new_lists
:
for
ng_id
in
new_lists
[
list_type
]
.
items
:
collect
(
ng_id
)
n_added
=
index_new_ngrams
(
all_possibly_new_ngram_ids
,
onto_corpus
)
my_log
.
append
(
"MERGE: added
%
i new ngram occurrences in docs"
%
n_added
)
# ======== Get the old lists =========
# ======== Get the old lists =========
old_lists
=
{}
old_lists
=
{}
...
...
gargantext/util/toolchain/ngrams_addition.py
0 → 100644
View file @
43ea9c0d
"""
Module for raw indexing a totally new ngram
    => creates new (doc_node <-> new_ngram) relations in NodeNgram

use cases:
    - from the annotation view, the user selects a free segment of text to make a new ngram
    - at list import, any new list can contain ngrams that have never been extracted

prerequisite:
    - normalize_chars(new_ngram_str)
    - normalize_forms(new_ngram_str)
    - add the new ngram to the `ngrams` table

procedure:
    - simple regexp search of the ngram string => addition to NodeNgram

/!\ -> morphological variants are NOT considered (ex: plural or declined forms)
"""
from
gargantext.models
import
Ngram
,
Node
,
NodeNgram
from
gargantext.util.db
import
session
,
bulk_insert
from
sqlalchemy
import
distinct
from
re
import
findall
,
IGNORECASE
# TODO from gargantext.constants import LIST_OF_KEYS_TO_INDEX = title, abstract
def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
    """
    Find occurrences of some ngrams in every document of the given corpus
    and insert them in the NodeNgram table.

    @param ngram_ids: a list of ids for Ngram objects
                      (we assume they already went through normalizations
                       and were already added to the Ngrams table,
                       and optionally to some of the lists like MAPLIST)
                      (but we can't know if they were previously indexed
                       in the corpus)
    @param corpus:    the CORPUS node
    @param keys:      the hyperdata text fields to index

    @return: the number of new NodeNgram rows added
    """
    # local import: `escape` lets us build a safe regexp from a raw term
    from re import compile as re_compile, escape as re_escape

    # the ngrams we won't process (those that were already indexed
    # in at least one document of this corpus)
    indexed_ngrams_subquery = (session
                                .query(distinct(NodeNgram.ngram_id))
                                .join(Node, Node.id == NodeNgram.node_id)
                                .filter(Node.parent_id == corpus.id)
                                .filter(Node.typename == 'DOCUMENT')
                                .subquery()
                               )

    # retrieve the ngrams from our list, filtering out the already indexed ones
    todo_ngrams = (session
                    .query(Ngram)
                    .filter(Ngram.id.in_(ngram_ids))
                    .filter(~ Ngram.id.in_(indexed_ngrams_subquery))
                    .all()
                    )

    # build each regexp ONCE, hoisted out of the docs loop:
    #     "british" => r'\bbritish\b'
    # NB: the term is escaped so that special chars (ex: "c++", "(s)")
    #     are matched literally instead of breaking the pattern
    todo_patterns = [
        (ngram.id, re_compile(r'\b%s\b' % re_escape(ngram.terms), IGNORECASE))
        for ngram in todo_ngrams
    ]

    # counting results: {doc_id: {ngram_id: n_occurrences}}
    node_ngram_to_write = {}

    # loop through the docs and their text fields
    for doc in corpus.children('DOCUMENT'):

        # a new empty counting subdict for this doc
        doc_counts = {}
        node_ngram_to_write[doc.id] = doc_counts

        for key in keys:
            # a text field
            text = doc.hyperdata.get(key, None)

            if not isinstance(text, str):
                # print("WARN: doc %i has no text in field %s" % (doc.id, key))
                continue

            for (ngram_id, pattern) in todo_patterns:
                # --------------------------------------- find ---
                n_occs = len(pattern.findall(text))
                # -----------------------------------------------

                # save the count results (summed over this doc's fields)
                if n_occs > 0:
                    doc_counts[ngram_id] = doc_counts.get(ngram_id, 0) + n_occs

    # flatten the per-doc counts into rows, then integrate all at the end
    my_new_rows = []
    add_new_row = my_new_rows.append
    for doc_id in node_ngram_to_write:
        for ngram_id in node_ngram_to_write[doc_id]:
            wei = node_ngram_to_write[doc_id][ngram_id]
            add_new_row([doc_id, ngram_id, wei])

    # free the counting dict before the bulk write
    del node_ngram_to_write

    bulk_insert(
        table = NodeNgram,
        fields = ('node_id', 'ngram_id', 'weight'),
        data = my_new_rows
    )

    n_added = len(my_new_rows)

    print("index_new_ngrams: added %i new NodeNgram rows" % n_added)

    return n_added
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment