Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
0e614f4d
Commit
0e614f4d
authored
Jun 14, 2016
by
Romain Loth
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
import terms table: better stats log and result format
parent
f9e95ef2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
59 additions
and
20 deletions
+59
-20
db.py
gargantext/util/db.py
+25
-2
ngramlists_tools.py
gargantext/util/ngramlists_tools.py
+34
-18
No files found.
gargantext/util/db.py
View file @
0e614f4d
...
@@ -86,7 +86,19 @@ class bulk_insert:
...
@@ -86,7 +86,19 @@ class bulk_insert:
readline
=
read
readline
=
read
def
bulk_insert_ifnotexists
(
model
,
uniquekey
,
fields
,
data
,
cursor
=
None
):
def
bulk_insert_ifnotexists
(
model
,
uniquekey
,
fields
,
data
,
cursor
=
None
,
do_stats
=
False
):
"""
Inserts bulk data with an intermediate check on a uniquekey
(ex: Ngram.terms) to see if the row existed before.
If the row already existed we just retrieve its id.
If it didn't exist we create it and retrieve the id.
Returns a dict {uniquekey => id}
Option:
do_stats: if True, also returns the number of rows that had no previous id (i.e. were newly inserted)
"""
if
cursor
is
None
:
if
cursor
is
None
:
db
,
cursor
=
get_cursor
()
db
,
cursor
=
get_cursor
()
mustcommit
=
True
mustcommit
=
True
...
@@ -109,6 +121,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
...
@@ -109,6 +121,7 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable
=
model
.
__tablename__
,
sourcetable
=
model
.
__tablename__
,
uniquecolumn
=
uniquekey
,
uniquecolumn
=
uniquekey
,
))
))
# insert what has not been found to the real table
# insert what has not been found to the real table
cursor
.
execute
(
'''
cursor
.
execute
(
'''
INSERT INTO {sourcetable} ({columns})
INSERT INTO {sourcetable} ({columns})
...
@@ -119,6 +132,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
...
@@ -119,6 +132,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
sourcetable
=
model
.
__tablename__
,
sourcetable
=
model
.
__tablename__
,
columns
=
', '
.
join
(
fields
),
columns
=
', '
.
join
(
fields
),
))
))
if
do_stats
:
# remember how many rows we inserted just now
n_new
=
cursor
.
rowcount
# retrieve dict associating unique key to id
# retrieve dict associating unique key to id
cursor
.
execute
(
'''
cursor
.
execute
(
'''
SELECT source.id, source.{uniquecolumn}
SELECT source.id, source.{uniquecolumn}
...
@@ -130,10 +148,15 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
...
@@ -130,10 +148,15 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None):
columns
=
', '
.
join
(
fields
),
columns
=
', '
.
join
(
fields
),
))
))
result
=
{
result
=
{
# term : new_id
row
[
1
]:
row
[
0
]
for
row
in
cursor
.
fetchall
()
row
[
1
]:
row
[
0
]
for
row
in
cursor
.
fetchall
()
}
}
# this is the end!
# this is the end!
cursor
.
execute
(
'DROP TABLE __tmp__'
)
cursor
.
execute
(
'DROP TABLE __tmp__'
)
if
mustcommit
:
if
mustcommit
:
db
.
commit
()
db
.
commit
()
return
result
if
do_stats
:
return
result
,
n_new
else
:
return
result
gargantext/util/ngramlists_tools.py
View file @
0e614f4d
...
@@ -253,7 +253,6 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
...
@@ -253,7 +253,6 @@ def export_ngramlists(node,fname=None,delimiter="\t",titles=False):
this_corpus_all_rows
=
map_csv_rows
+
miam_csv_rows
+
stop_csv_rows
this_corpus_all_rows
=
map_csv_rows
+
miam_csv_rows
+
stop_csv_rows
# choice of output: file or string
# choice of output: file or string
print
(
type
(
fname
))
if
fname
==
None
:
if
fname
==
None
:
out_file
=
StringIO
()
out_file
=
StringIO
()
elif
type
(
fname
)
==
str
:
elif
type
(
fname
)
==
str
:
...
@@ -317,6 +316,8 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -317,6 +316,8 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
(creates absent ngrams if necessary)
(creates absent ngrams if necessary)
=> use the new ids to map the relations involving the old ones
=> use the new ids to map the relations involving the old ones
NB: the creation of MAINLIST also adds all elements from the MAPLIST
NB: To merge the imported lists into a corpus node's lists,
NB: To merge the imported lists into a corpus node's lists,
chain this function with merge_ngramlists()
chain this function with merge_ngramlists()
'''
'''
...
@@ -346,11 +347,17 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -346,11 +347,17 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
quoting
=
QUOTE_MINIMAL
quoting
=
QUOTE_MINIMAL
)
)
# for stats
n_read_lines
=
0
n_read_lines
=
0
n_total_ng
=
0
n_added_ng
=
0
n_group_relations
=
0
# load CSV + initial checks
# load CSV + initial checks
for
i
,
csv_row
in
enumerate
(
ngrams_csv_rows
):
for
i
,
csv_row
in
enumerate
(
ngrams_csv_rows
):
print
(
"---------------READ LINE
%
i"
%
i
)
# fyi
n_read_lines
+=
1
# print("---------------READ LINE %i" % i)
try
:
try
:
this_ng_oldid
=
str
(
csv_row
[
0
])
this_ng_oldid
=
str
(
csv_row
[
0
])
this_ng_term
=
str
(
csv_row
[
1
])
this_ng_term
=
str
(
csv_row
[
1
])
...
@@ -398,19 +405,19 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -398,19 +405,19 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
(
this_ng_oldid
,
external_subform_id
)
(
this_ng_oldid
,
external_subform_id
)
)
)
# fyi
n_read_lines
+=
1
# end of CSV read
# end of CSV read
fh
.
close
()
fh
.
close
()
# ======== ngram save + id lookup =========
# ======== ngram save + id lookup =========
# returns a dict {term => id}
n_total_ng
=
len
(
imported_ngrams_dbdata
)
new_ngrams_ids
=
bulk_insert_ifnotexists
(
# returns a dict {term => id} and a count of inserted ones
(
new_ngrams_ids
,
n_added_ng
)
=
bulk_insert_ifnotexists
(
model
=
Ngram
,
model
=
Ngram
,
uniquekey
=
'terms'
,
uniquekey
=
'terms'
,
fields
=
(
'terms'
,
'n'
),
fields
=
(
'terms'
,
'n'
),
data
=
imported_ngrams_dbdata
data
=
imported_ngrams_dbdata
,
do_stats
=
True
)
)
del
imported_ngrams_dbdata
del
imported_ngrams_dbdata
...
@@ -421,34 +428,43 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
...
@@ -421,34 +428,43 @@ def import_ngramlists(fname, delimiter='\t', group_delimiter='|'):
del
new_ngrams_ids
del
new_ngrams_ids
del
imported_ngrams_oldids
del
imported_ngrams_oldids
print
(
old_to_new_id_map
)
#
print(old_to_new_id_map)
print
(
import_nodes_ngrams
)
#
print(import_nodes_ngrams)
# ======== Import into lists =========
# ======== Import into lists =========
# 3 x abstract lists
# 3 x abstract lists
+ 1 translations
new_lists
=
{
result
=
{
'map'
:
UnweightedList
(),
'map'
:
UnweightedList
(),
'main'
:
UnweightedList
(),
'main'
:
UnweightedList
(),
'stop'
:
UnweightedList
()
'stop'
:
UnweightedList
(),
'groupings'
:
Translations
()
}
}
for
list_type
in
import_nodes_ngrams
:
for
list_type
in
import_nodes_ngrams
:
for
old_id
in
import_nodes_ngrams
[
list_type
]:
for
old_id
in
import_nodes_ngrams
[
list_type
]:
new_id
=
old_to_new_id_map
[
old_id
]
new_id
=
old_to_new_id_map
[
old_id
]
# add to the abstract list
# add to the abstract list
new_lists
[
list_type
]
.
items
.
add
(
new_id
)
result
[
list_type
]
.
items
.
add
(
new_id
)
# ======== Synonyms =========
# for main also add map elements
new_groups
=
Translations
()
if
list_type
==
'main'
:
for
old_id
in
import_nodes_ngrams
[
'map'
]:
new_id
=
old_to_new_id_map
[
old_id
]
result
[
'main'
]
.
items
.
add
(
new_id
)
# ======== Synonyms =========
for
(
x
,
y
)
in
imported_groupings
:
for
(
x
,
y
)
in
imported_groupings
:
new_mainform_id
=
old_to_new_id_map
[
x
]
new_mainform_id
=
old_to_new_id_map
[
x
]
new_subform_id
=
old_to_new_id_map
[
y
]
new_subform_id
=
old_to_new_id_map
[
y
]
# /!\ Translations use (subform => mainform) order
# /!\ Translations use (subform => mainform) order
new_groups
.
items
[
new_subform_id
]
=
new_mainform_id
result
[
'groupings'
]
.
items
[
new_subform_id
]
=
new_mainform_id
n_group_relations
+=
1
# ------------------------------------------------------------------
# ------------------------------------------------------------------
print
(
"IMPORT: read
%
i lines from the CSV"
%
n_read_lines
)
print
(
"IMPORT: read
%
i lines from the CSV"
%
n_read_lines
)
print
(
"IMPORT: read
%
i terms (
%
i added and
%
i already existing)"
%
(
n_total_ng
,
n_added_ng
,
n_total_ng
-
n_added_ng
)
)
print
(
"IMPORT: read
%
i grouping relations"
%
n_group_relations
)
return
(
new_lists
,
new_groups
)
return
result
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment