Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
b7a9fdbc
Commit
b7a9fdbc
authored
Sep 07, 2017
by
sim
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Clean code and better debugging output
parent
3db7968b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
30 additions
and
24 deletions
+30
-24
ngramlists_tools.py
gargantext/util/ngramlists_tools.py
+30
-24
No files found.
gargantext/util/ngramlists_tools.py
View file @
b7a9fdbc
...
...
@@ -8,8 +8,7 @@ Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
"""
from
gargantext.util.group_tools
import
query_groups
,
group_union
from
gargantext.util.db
import
session
,
desc
,
func
,
\
bulk_insert_ifnotexists
from
gargantext.util.db
import
session
,
bulk_insert_ifnotexists
from
gargantext.models
import
Ngram
,
NodeNgram
,
NodeNodeNgram
,
\
NodeNgramNgram
,
Node
...
...
@@ -25,7 +24,6 @@ from gargantext.util.toolchain.ngrams_extraction import normalize_forms
# merge will also index the new ngrams in the docs of the corpus
from
gargantext.util.toolchain.ngrams_addition
import
index_new_ngrams
from
sqlalchemy.sql
import
exists
from
os
import
path
from
csv
import
writer
,
reader
,
QUOTE_MINIMAL
from
collections
import
defaultdict
...
...
@@ -179,9 +177,7 @@ def ngrams_to_csv_rows(ngram_objs, ngram_dico={}, group_infos={},
# 3 columns = |status, | mainform, | forms
# (type_of_list) ( term ) ( subterm1|&|subterm2 )
csv_rows
.
append
(
[
list_type
,
ng_obj
.
terms
,
this_grouped_terms
]
)
csv_rows
.
append
([
list_type
,
ng_obj
.
terms
,
this_grouped_terms
])
return
csv_rows
...
...
@@ -391,6 +387,9 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
NB: To merge the imported lists into a corpus node's lists,
chain this function with merge_ngramlists()
'''
list_types
=
[
'stop'
,
'main'
,
'map'
]
# ---------------
# ngram storage
# ---------------
...
...
@@ -455,7 +454,6 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
# headers
if
i
==
0
:
n_cols
=
len
(
csv_row
)
for
j
,
colname
in
enumerate
(
csv_row
):
if
colname
in
[
'label'
,
'status'
,
'forms'
]:
columns
[
colname
]
=
j
...
...
@@ -502,31 +500,30 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
continue
# --- check correct list type
if
not
this_list_type
in
[
'stop'
,
'main'
,
'map'
]
:
if
not
this_list_type
in
list_types
:
print
(
"IMPORT WARN: (skip line) wrong list type at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
continue
# subforms can be duplicated (in forms and another label)
# but we must take care of unwanted other duplicates too
if
this_row_label
in
imported_unique_ngramstrs
:
print
(
"TODO IMPORT DUPL: (skip line) term appears more than once at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
if
imported_unique_ngramstrs
.
get
(
this_row_label
)
==
1
:
print
(
"TODO IMPORT DUPL: (skip line) term
%
r
appears more than once at CSV
%
s:l.
%
i"
%
(
this_row_label
,
fname
,
i
))
# ================= Store the data ====================
# the ngram census
imported_unique_ngramstrs
[
this_row_label
]
=
True
imported_unique_ngramstrs
[
this_row_label
]
=
1
# and the "list to ngram" relation
imported_nodes_ngrams
[
this_list_type
]
.
append
(
this_row_label
)
# ====== Store synonyms from the import (if any) ======
if
len
(
this_row_forms
)
!=
0
:
other_terms
=
[]
for
raw_term_str
in
this_row_forms
.
split
(
group_delimiter
):
# each subform is also like an ngram declaration
term_str
=
normalize_forms
(
normalize_chars
(
raw_term_str
))
imported_unique_ngramstrs
[
term_str
]
=
True
imported_unique_ngramstrs
[
term_str
]
=
2
imported_nodes_ngrams
[
this_list_type
]
.
append
(
term_str
)
# the optional repeated mainform doesn't interest us
...
...
@@ -604,7 +601,10 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
%
(
n_total_ng
,
n_added_ng
,
n_total_ng
-
n_added_ng
)
)
print
(
"IMPORT: read
%
i grouping relations"
%
n_group_relations
)
# print("IMPORT RESULT", result)
list_counts
=
[(
typ
,
len
(
result
.
get
(
typ
)))
for
typ
in
list_types
]
list_counts
.
append
((
'total'
,
sum
(
x
[
1
]
for
x
in
list_counts
)))
print
(
"IMPORT: "
+
'; '
.
join
(
'
%
s
%
s'
%
stats
for
stats
in
list_counts
))
return
result
def
merge_ngramlists
(
new_lists
=
{},
onto_corpus
=
None
,
del_originals
=
[]):
...
...
@@ -712,9 +712,11 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
# ======== Merging all involved ngrams =========
# all memberships with resolved conflicts of interfering memberships
# all ngram memberships with resolved conflicts of interfering memberships
# (associates ngram ids with list types -- see linfos definition above)
resolved_memberships
=
{}
# iterates over each ngram of each list type for both old and new lists
for
list_set
in
[
old_lists
,
new_lists
]:
for
lid
,
info
in
enumerate
(
linfos
):
list_type
=
info
[
'key'
]
...
...
@@ -744,11 +746,11 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
# ======== Merging old and new groups =========
# get the arcs already in the target DB (directed couples)
previous_links
=
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
old_group_id
)
.
all
()
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
)
.
filter
(
NodeNgramNgram
.
node_id
==
old_group_id
)
.
all
()
n_links_previous
=
len
(
previous_links
)
...
...
@@ -816,7 +818,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
list_type
=
linfos
[
lid
][
'key'
]
merged_results
[
list_type
]
.
items
.
add
(
ng_id
)
#
print("IMPORT: added %i elements in the lists indices" % added_nd_ng)
print
(
"IMPORT: added
%
i elements in the lists indices"
%
added_nd_ng
)
# ======== Overwrite old data with new =========
for
lid
,
info
in
enumerate
(
linfos
):
...
...
@@ -839,10 +841,14 @@ def import_and_merge_ngramlists(file_contents, onto_corpus_id, overwrite=False):
"""
A single function to run import_ngramlists and merge_ngramlists together
"""
print
(
"import list"
)
print
(
"IMPORT CSV termlists file with
%
s lines in corpus
%
s (
%
s)"
%
(
len
(
file_contents
),
onto_corpus_id
,
'overwrite'
if
overwrite
else
'merge'
))
new_lists
=
import_ngramlists
(
file_contents
)
corpus_node
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
onto_corpus_id
)
.
first
(
)
corpus_node
=
session
.
query
(
Node
)
.
get
(
onto_corpus_id
)
# merge the new_lists onto those of the target corpus
del_originals
=
[
'stop'
,
'main'
,
'map'
]
if
overwrite
else
[]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment