Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
0acab158
Commit
0acab158
authored
Jun 15, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
import terms table: merge function ok
parent
99316373
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
65 additions
and
26 deletions
+65
-26
group_tools.py
gargantext/util/group_tools.py
+1
-0
ngramlists_tools.py
gargantext/util/ngramlists_tools.py
+64
-26
No files found.
gargantext/util/group_tools.py
View file @
0acab158
...
...
@@ -110,6 +110,7 @@ def group_union(g_a_links, g_b_links):
rloth: this is some slightly amended code
from Samuel's in rest_v1_0.ngrams.Group.get
TODO use "most frequent" score if leader candidates are ex aequo by degree.
"""
# output: list of links forming new group
...
...
gargantext/util/ngramlists_tools.py
View file @
0acab158
...
...
@@ -2,7 +2,9 @@
Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
- query_list(list_id) to retrieve ngrams
- export_ngramlists(corpus_id)
- export_ngramlists(corpus_node)
- import_ngramlists(corpus_node)
- merge_ngramlists(new_lists, onto_corpus = corpus_node)
"""
from
gargantext.util.group_tools
import
query_groups
,
group_union
...
...
@@ -13,6 +15,10 @@ from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, \
from
gargantext.util.lists
import
UnweightedList
,
Translations
# import will implement the same text cleaning procedures as toolchain
from
gargantext.util.toolchain.parsing
import
normalize_chars
from
gargantext.util.toolchain.ngrams_extraction
import
normalize_terms
from
sqlalchemy.sql
import
exists
from
os
import
path
from
csv
import
writer
,
reader
,
QUOTE_MINIMAL
...
...
@@ -323,7 +329,7 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
chain this function with merge_ngramlists()
'''
# --------------
#
READ CSV
#
# --------------
# main storage for the ngrams by list
...
...
@@ -342,6 +348,10 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
# (ie imported ids.. that will have to be translated
# to target db ids)
# skipped lines can (very rarely) be used in groups => mark as ignored
ignored_oldids
=
[]
# =============== READ CSV ===============
fh
=
open
(
fname
,
"r"
)
ngrams_csv_rows
=
reader
(
fh
,
delimiter
=
delimiter
,
...
...
@@ -366,25 +376,37 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
this_list_type
=
str
(
csv_row
[
3
])
this_ng_group
=
str
(
csv_row
[
4
])
# string normalizations
this_ng_term
=
normalize_terms
(
normalize_chars
(
this_ng_term
))
except
:
if
i
==
0
:
print
(
"WARN: (skip line) probable header line at CSV
%
s:l.0"
%
fname
)
continue
# --- check format before any old ID retrieve
if
not
match
(
"
\
d+$"
,
this_ng_oldid
):
print
(
"WARN: (skip line) bad ID at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
continue
else
:
this_ng_oldid
=
int
(
this_ng_oldid
)
# --- term checking
if
not
len
(
this_ng_term
)
>
0
:
print
(
"WARN: (skip line) empty term at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
ignored_oldids
.
append
(
this_ng_oldid
)
continue
# --- check before any old ID retrieve
if
not
match
(
"
\
d+$"
,
this_ng_oldid
):
print
(
"WARN: (skip line) bad ID at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
# --- check if not a duplicate string
if
this_ng_term
in
imported_ngrams_oldids
:
ignored_oldids
.
append
(
this_ng_oldid
)
print
(
"WARN: (skip line) term appears more than once (previous id:
%
i) at CSV
%
s:l.
%
i"
%
(
imported_ngrams_oldids
[
this_ng_term
],
fname
,
i
))
continue
else
:
this_ng_oldid
=
int
(
this_ng_oldid
)
# --- check correct list type
if
not
this_list_type
in
[
'stop'
,
'main'
,
'map'
]:
ignored_oldids
.
append
(
this_ng_oldid
)
print
(
"WARN: (skip line) wrong list type at CSV
%
s:l.
%
i"
%
(
fname
,
i
))
continue
...
...
@@ -455,12 +477,13 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
# ======== Synonyms =========
for
(
x
,
y
)
in
imported_groupings
:
new_mainform_id
=
old_to_new_id_map
[
x
]
new_subform_id
=
old_to_new_id_map
[
y
]
if
(
x
not
in
ignored_oldids
)
and
(
y
not
in
ignored_oldids
):
new_mainform_id
=
old_to_new_id_map
[
x
]
new_subform_id
=
old_to_new_id_map
[
y
]
# /!\ Translations use (subform => mainform) order
result
[
'groupings'
]
.
items
[
new_subform_id
]
=
new_mainform_id
n_group_relations
+=
1
# /!\ Translations use (subform => mainform) order
result
[
'groupings'
]
.
items
[
new_subform_id
]
=
new_mainform_id
n_group_relations
+=
1
# ------------------------------------------------------------------
print
(
"IMPORT: read
%
i lines from the CSV"
%
n_read_lines
)
...
...
@@ -533,14 +556,24 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
onto_corpus
.
children
(
"MAPLIST"
)
.
first
()
.
id
]
# retrieve old data...
old_group_id
=
onto_corpus
.
children
(
"GROUPLIST"
)
.
first
()
.
id
# retrieve old data into old_lists[list_type]...
# ----------------------------------------------
for
lid
,
linfo
in
enumerate
(
linfos
):
list_type
=
linfo
[
'key'
]
if
list_type
not
in
del_originals
:
old_lists
[
list_type
]
=
UnweightedList
(
tgt_nodeids
[
lid
])
# NB can't use UnweightedList(tgt_nodeids[lid])
# because we need to include out-of-list subforms
list_ngrams_q
=
query_list
(
tgt_nodeids
[
lid
],
groupings_id
=
old_group_id
)
old_lists
[
list_type
]
=
UnweightedList
(
list_ngrams_q
.
all
())
else
:
# ...or use empty objects if replacing old list
# ----------------------------------------------
old_lists
[
list_type
]
=
UnweightedList
()
print
(
"MERGE: ignoring old
%
s which will be overwritten"
%
linfo
[
'name'
])
# ======== Merging all involved ngrams =========
...
...
@@ -569,14 +602,13 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
# from mainforms but we'll force them into same list
# after we merge the groups
#
del old_lists
#
del new_lists['stop']
#
del new_lists['main']
#
del new_lists['map']
del
old_lists
del
new_lists
[
'stop'
]
del
new_lists
[
'main'
]
del
new_lists
[
'map'
]
# ======== Merging old and new groups =========
# get the arcs already in the target DB (directed couples)
old_group_id
=
onto_corpus
.
children
(
"GROUPLIST"
)
.
first
()
.
id
previous_links
=
session
.
query
(
NodeNgramNgram
.
ngram1_id
,
NodeNgramNgram
.
ngram2_id
...
...
@@ -593,20 +625,24 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
for
(
y
,
x
)
in
new_lists
[
'groupings'
]
.
items
.
items
():
add_link
((
x
,
y
))
n_links_added
+=
1
#
del new_lists
del
new_lists
# group_union: joins 2 different synonym-links lists into 1 new list
new_links
=
group_union
(
previous_links
,
translated_imported_links
)
#
del previous_links
#
del translated_imported_links
del
previous_links
del
translated_imported_links
n_links_after
=
len
(
new_links
)
merged_group
=
Translations
([(
y
,
x
)
for
(
x
,
y
)
in
new_links
])
#
del new_links
del
new_links
print
(
"IMPORT: groupings updated (links before/added/after:
%
i/
%
i/
%
i)"
%
(
n_links_previous
,
n_links_added
,
n_links_after
))
# ======== Overwrite old data with new =========
merged_group
.
save
(
old_group_id
)
print
(
"MERGE: groupings
%
i updated (links before/added/after:
%
i/
%
i/
%
i)"
%
(
old_group_id
,
n_links_previous
,
n_links_added
,
n_links_after
))
# ======== Target list(s) append data =========
# if list 2 => write in both tgt_data_lists [1,2]
...
...
@@ -641,7 +677,6 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
list_type
=
linfos
[
lid
][
'key'
]
merged_results
[
list_type
]
.
items
.
add
(
ng_id
)
# print("IMPORT: added %i elements in the lists indices" % added_nd_ng)
# ======== Overwrite old data with new =========
...
...
@@ -650,3 +685,6 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
list_type
=
info
[
'key'
]
result
=
merged_results
[
list_type
]
result
.
save
(
tgt_id
)
print
(
"MERGE:
%
s
%
i updated (new size:
%
i)"
%
(
info
[
'name'
],
tgt_id
,
len
(
merged_results
[
list_type
]
.
items
)))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment