humanities / gargantext

Commit e24efe96, authored Sep 08, 2017 by Alexandre Delanoë

Merge remote-tracking branch 'origin/simon-unstable-lists-fix' into unstable

Parents: 06f55400, 224eae66
Showing 6 changed files with 170 additions and 195 deletions:

- gargantext/util/group_tools.py (+17 -14)
- gargantext/util/lists.py (+3 -0)
- gargantext/util/ngramlists_tools.py (+42 -31)
- gargantext/util/parsers/CSV.py (+51 -112)
- gargantext/util/toolchain/ngrams_extraction.py (+38 -37)
- templates/pages/projects/project.html (+19 -1)
gargantext/util/group_tools.py
@@ -7,7 +7,7 @@ from gargantext.util.db import session, aliased
 from gargantext.models import Ngram, NodeNgramNgram
 from igraph import Graph # for group_union
 
-def query_groups(groupings_id, details=False):
+def query_groups(groupings_id, details=False, sort=False):
     """
     Listing of couples (mainform, subform)
       aka (ngram1_id, ngram2_id)
@@ -15,24 +15,27 @@ def query_groups(groupings_id, details=False):
     Parameter:
       - details: if False, just send the array of couples
                  if True, send quadruplets with (ngram1_id, term1, ngram2_id, term2)
+      - sort: order results by terms of ngram1 then ngram2
     """
+    if details or sort:
+        Ngram1, Ngram2 = Ngram, aliased(Ngram)
+
     if not details:
         # simple contents
-        query = session.query(NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
+        columns = (NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
     else:
         # detailed contents (id + terms)
-        Ngram1 = aliased(Ngram)
-        Ngram2 = aliased(Ngram)
-        query = (session
-                    .query(
-                        NodeNgramNgram.ngram1_id,
-                        Ngram1.terms,
-                        NodeNgramNgram.ngram2_id,
-                        Ngram2.terms,
-                     )
-                    .join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
-                    .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id)
-                )
+        columns = (Ngram1.id, Ngram1.terms, Ngram2.id, Ngram2.terms)
+
+    query = session.query(*columns)
+
+    if details or sort:
+        query = (query
+            .join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
+            .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))
+
+    if sort:
+        query = query.order_by(Ngram1.terms, Ngram2.terms)
 
     # main filter
     # -----------
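The refactor above prepares a shared columns tuple, then reuses one pair of join clauses for both the details and sort cases: Ngram is joined twice through SQLAlchemy's aliased() so each side of a (mainform, subform) couple can be selected and ordered independently. Below is a minimal, self-contained sketch of that two-alias join pattern; the in-memory SQLite engine and the trimmed-down models are illustrative assumptions, not gargantext's actual schema.

from sqlalchemy import Column, ForeignKey, Integer, String, create_engine
from sqlalchemy.orm import Session, aliased, declarative_base

Base = declarative_base()

class Ngram(Base):
    __tablename__ = 'ngrams'
    id = Column(Integer, primary_key=True)
    terms = Column(String)

class NodeNgramNgram(Base):
    __tablename__ = 'nodes_ngrams_ngrams'
    id = Column(Integer, primary_key=True)
    ngram1_id = Column(Integer, ForeignKey('ngrams.id'))   # mainform
    ngram2_id = Column(Integer, ForeignKey('ngrams.id'))   # subform

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    # one alias per side of the couple: same table, two roles
    Ngram1, Ngram2 = Ngram, aliased(Ngram)
    query = (session.query(Ngram1.id, Ngram1.terms, Ngram2.id, Ngram2.terms)
                    .select_from(NodeNgramNgram)
                    .join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
                    .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id)
                    .order_by(Ngram1.terms, Ngram2.terms))
    print(query.all())   # [] on this empty toy database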
gargantext/util/lists.py
@@ -50,6 +50,9 @@ class _BaseClass:
         else:
             return NotImplemented
 
+    def __len__(self):
+        return len(self.items)
+
     def __repr__(self):
         items = self.items
         if isinstance(items, defaultdict):
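For context on the __len__ addition: a class that defines __len__ automatically supports len(), and, in the absence of __bool__, Python derives truthiness from it, so an empty list wrapper becomes falsy. A toy illustration (not gargantext code):

class Bag:
    def __init__(self, items=()):
        self.items = set(items)
    def __len__(self):
        # delegates to the underlying container, as _BaseClass now does
        return len(self.items)

assert len(Bag({1, 2, 3})) == 3
assert not Bag()   # empty => falsy, via __len__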
gargantext/util/ngramlists_tools.py
@@ -8,8 +8,7 @@ Tools to work with ngramlists (MAINLIST, MAPLIST, STOPLIST)
 """
 from gargantext.util.group_tools import query_groups, group_union
-from gargantext.util.db import session, desc, func, \
-                               bulk_insert_ifnotexists
+from gargantext.util.db import session, bulk_insert_ifnotexists
 from gargantext.models import Ngram, NodeNgram, NodeNodeNgram, \
                               NodeNgramNgram, Node
@@ -25,7 +24,6 @@ from gargantext.util.toolchain.ngrams_extraction import normalize_forms
 # merge will also index the new ngrams in the docs of the corpus
 from gargantext.util.toolchain.ngrams_addition import index_new_ngrams
-from sqlalchemy.sql import exists
 from os import path
 from csv import writer, reader, QUOTE_MINIMAL
 from collections import defaultdict
@@ -35,8 +33,8 @@ from celery import shared_task
 def query_list(list_id,
                pagination_limit=None, pagination_offset=None,
-               details=False, scoring_metric_id=None, groupings_id=None):
+               details=False, scoring_metric_id=None, groupings_id=None,
+               sort=False):
     """
     Paginated listing of ngram_ids in a NodeNgram lists.
@@ -51,6 +49,7 @@ def query_list(list_id,
                     (for details and sorting)
       - groupings_id: optional id of a list of grouping relations (synonyms)
                       (each synonym will be added to the list if not already in there)
+      - sort: order by Ngram.terms (not possible if details is False)
 
     FIXME: subforms appended recently and not generalized enough
            => add a common part for all "if groupings_id"
@@ -114,7 +113,10 @@ def query_list(list_id,
         query = query.limit(pagination_limit)
 
     if pagination_offset:
-        query = query.offset(pagination_offsets)
+        query = query.offset(pagination_offset)
+
+    if details and sort:
+        query = query.order_by(Ngram.terms)
 
     return query
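This hunk fixes a NameError-in-waiting (pagination_offsets for pagination_offset) and appends the optional ordering step. The surrounding function builds its query incrementally, wrapping it only when an option is requested. A runnable sketch of that conditional chaining style, with FakeQuery standing in for an SQLAlchemy query (an assumption for illustration):

class FakeQuery:
    def __init__(self, ops=()):
        self.ops = list(ops)
    def limit(self, n):      return FakeQuery(self.ops + ['LIMIT %d' % n])
    def offset(self, n):     return FakeQuery(self.ops + ['OFFSET %d' % n])
    def order_by(self, col): return FakeQuery(self.ops + ['ORDER BY %s' % col])

def build(query, pagination_limit=None, pagination_offset=None,
          details=False, sort=False):
    if pagination_limit:
        query = query.limit(pagination_limit)
    if pagination_offset:
        query = query.offset(pagination_offset)
    if details and sort:   # sorting requires Ngram.terms in the SELECT
        query = query.order_by('terms')
    return query

q = build(FakeQuery(), pagination_limit=20, pagination_offset=40,
          details=True, sort=True)
assert q.ops == ['LIMIT 20', 'OFFSET 40', 'ORDER BY terms']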
@@ -175,9 +177,7 @@ def ngrams_to_csv_rows(ngram_objs, ngram_dico={}, group_infos={},
         # 3 columns = |status,       | mainform, | forms
         #             (type_of_list) ( term )    ( subterm1|&|subterm2 )
-        csv_rows.append(
-              [list_type, ng_obj.terms, this_grouped_terms]
-              )
+        csv_rows.append([list_type, ng_obj.terms, this_grouped_terms])
 
     return csv_rows
@@ -220,9 +220,10 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     #    corresponding lists of ngram_ids
     # ------------------------------------
     #   content: list of ngram objects [(2562,"monterme",1),...]
-    stop_ngrams = query_list(stoplist_node.id, details=True, groupings_id=group_node.id).all()
-    main_ngrams = query_list(mainlist_node.id, details=True, groupings_id=group_node.id).all()
-    map_ngrams  = query_list(maplist_node.id,  details=True, groupings_id=group_node.id).all()
+    stop_ngrams, main_ngrams, map_ngrams = (
+        query_list(n.id, details=True, groupings_id=group_node.id, sort=True).all()
+        for n in (stoplist_node, mainlist_node, maplist_node))
 
     # for debug ---------->8 --------------------
     #~ stop_ngrams = stop_ngrams[0:10]
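The three near-identical query_list calls collapse into a single generator expression whose results are tuple-unpacked; the 3-way assignment drains the generator in order. A toy equivalent of the idiom:

node_names = ('stoplist', 'mainlist', 'maplist')
stop, main, maplist = (name.upper() for name in node_names)
assert (stop, main, maplist) == ('STOPLIST', 'MAINLIST', 'MAPLIST')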
@@ -239,7 +240,7 @@ def export_ngramlists(node,fname=None,delimiter=DEFAULT_CSV_DELIM,titles=True):
     # for the groups we got couples of ids in the DB
     # -------------------
     #  ex: [(3544, 2353), (2787, 4032), ...]
-    group_ngram_id_couples = query_groups(group_node.id).all()
+    group_ngram_id_couples = query_groups(group_node.id, sort=True)
 
     # we expand this to a double structure for groups lookup
     #  1) g['links'] = k couples (x,y_i) as a set [x => {y1,y2}]
@@ -386,6 +387,9 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
     NB: To merge the imported lists into a corpus node's lists,
         chain this function with merge_ngramlists()
     '''
+    list_types = ['stop','main','map']
+
     # ---------------
     # ngram storage
     # ---------------
@@ -450,7 +454,6 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
             # headers
             if i == 0:
-                n_cols = len(csv_row)
                 for j, colname in enumerate(csv_row):
                     if colname in ['label', 'status', 'forms']:
                         columns[colname] = j
@@ -497,31 +500,30 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
                 continue
 
             # --- check correct list type
-            if not this_list_type in ['stop','main','map']:
+            if not this_list_type in list_types:
                 print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i" % (fname, i))
                 continue
 
             # subforms can be duplicated (in forms and another label)
             # but we must take care of unwanted other duplicates too
-            if this_row_label in imported_unique_ngramstrs:
-                print("TODO IMPORT DUPL: (skip line) term appears more than once at CSV %s:l.%i" % (fname, i))
+            if imported_unique_ngramstrs.get(this_row_label) == 1:
+                print("TODO IMPORT DUPL: (skip line) term %r appears more than once at CSV %s:l.%i" % (this_row_label, fname, i))
 
             # ================= Store the data ====================
             # the ngram census
-            imported_unique_ngramstrs[this_row_label] = True
+            imported_unique_ngramstrs[this_row_label] = 1
 
             # and the "list to ngram" relation
             imported_nodes_ngrams[this_list_type].append(this_row_label)
 
             # ====== Store synonyms from the import (if any) ======
             if len(this_row_forms) != 0:
                 other_terms = []
                 for raw_term_str in this_row_forms.split(group_delimiter):
 
                     # each subform is also like an ngram declaration
                     term_str = normalize_forms(normalize_chars(raw_term_str))
-                    imported_unique_ngramstrs[term_str] = True
+                    imported_unique_ngramstrs[term_str] = 2
                     imported_nodes_ngrams[this_list_type].append(term_str)
 
                     # the optional repeated mainform doesn't interest us
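Reading the new code, the census dict imported_unique_ngramstrs now distinguishes how a term was seen: 1 appears to mark a row label (mainform) and 2 a subform, so only a repeated mainform triggers the duplicate warning while subform repeats stay legal. A minimal sketch of that dict discipline (an interpretation of the commit, not gargantext code):

census = {}

def see_mainform(term):
    # warn only when the same mainform was already read as a mainform
    if census.get(term) == 1:
        print("duplicate mainform: %r" % term)
    census[term] = 1

def see_subform(term):
    census[term] = 2

see_mainform("optimization")
see_subform("optimisation")
see_mainform("optimization")   # -> duplicate mainform: 'optimization'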
@@ -599,7 +601,10 @@ def import_ngramlists(the_file, delimiter=DEFAULT_CSV_DELIM,
           % (n_total_ng, n_added_ng, n_total_ng - n_added_ng))
     print("IMPORT: read %i grouping relations" % n_group_relations)
 
     # print("IMPORT RESULT", result)
+    list_counts = [(typ, len(result.get(typ))) for typ in list_types]
+    list_counts.append(('total', sum(x[1] for x in list_counts)))
+    print("IMPORT: " + '; '.join('%s %s' % stats for stats in list_counts))
 
     return result
 
 def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
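The new summary line relies on %-formatting accepting a tuple directly, so each (name, count) pair fills both placeholders at once. A standalone demonstration with made-up counts:

list_types = ['stop', 'main', 'map']
result = {'stop': ['a'] * 4, 'main': ['b'] * 120, 'map': ['c'] * 50}

list_counts = [(typ, len(result.get(typ))) for typ in list_types]
list_counts.append(('total', sum(x[1] for x in list_counts)))
print("IMPORT: " + '; '.join('%s %s' % stats for stats in list_counts))
# IMPORT: stop 4; main 120; map 50; total 174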
@@ -707,9 +712,11 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
     # ======== Merging all involved ngrams =========
 
-    # all memberships with resolved conflicts of interfering memberships
+    # all ngram memberships with resolved conflicts of interfering memberships
+    # (associates ngram ids with list types -- see linfos definition above)
     resolved_memberships = {}
 
+    # iterates over each ngram of each list type for both old and new lists
     for list_set in [old_lists, new_lists]:
         for lid, info in enumerate(linfos):
             list_type = info['key']
@@ -739,11 +746,11 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
(whitespace-only change: the query block was re-indented)
     # ======== Merging old and new groups =========
     # get the arcs already in the target DB (directed couples)
     previous_links = session.query(
-                        NodeNgramNgram.ngram1_id,
-                        NodeNgramNgram.ngram2_id
-                     ).filter(
-                        NodeNgramNgram.node_id == old_group_id
-                     ).all()
+                       NodeNgramNgram.ngram1_id,
+                       NodeNgramNgram.ngram2_id
+                    ).filter(
+                       NodeNgramNgram.node_id == old_group_id
+                    ).all()
 
     n_links_previous = len(previous_links)
@@ -811,7 +818,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
             list_type = linfos[lid]['key']
             merged_results[list_type].items.add(ng_id)
 
-    # print("IMPORT: added %i elements in the lists indices" % added_nd_ng)
+    print("IMPORT: added %i elements in the lists indices" % added_nd_ng)
 
     # ======== Overwrite old data with new =========
     for lid, info in enumerate(linfos):
@@ -834,10 +841,14 @@ def import_and_merge_ngramlists(file_contents, onto_corpus_id, overwrite=False):
     """
     A single function to run import_ngramlists and merge_ngramlists together
     """
-    print("import list")
+    print("IMPORT CSV termlists file with %s lines in corpus %s (%s)" % (
+        len(file_contents), onto_corpus_id,
+        'overwrite' if overwrite else 'merge'))
 
     new_lists = import_ngramlists(file_contents)
 
-    corpus_node = session.query(Node).filter(Node.id == onto_corpus_id).first()
+    corpus_node = session.query(Node).get(onto_corpus_id)
 
     # merge the new_lists onto those of the target corpus
     del_originals = ['stop','main','map'] if overwrite else []
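The corpus lookup switches from .filter(...).first() to Query.get(), the primary-key shortcut that can serve the object straight from the session's identity map without emitting SQL. (On SQLAlchemy 1.4+, Session.get() is the preferred spelling; Query.get() matches this 2017 code base.) A self-contained sketch with a toy Node model, not gargantext's:

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Node(Base):
    __tablename__ = 'nodes'
    id = Column(Integer, primary_key=True)
    name = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add(Node(id=42, name='corpus'))
    session.commit()
    # both spellings resolve to the same identity-mapped object
    assert (session.query(Node).get(42)
            is session.query(Node).filter(Node.id == 42).first())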
gargantext/util/parsers/CSV.py
@@ -4,128 +4,67 @@ import sys
 import csv
 csv.field_size_limit(sys.maxsize)
 
 import numpy as np
 import os
 
 class CSVParser(Parser):
+    DELIMITERS = ", \t;|:"
 
-    def CSVsample(self, small_contents, delim):
-        reader = csv.reader(small_contents, delimiter=delim)
+    def detect_delimiter(self, lines, sample_size=10):
+        sample = lines[:sample_size]
 
-        Freqs = []
-        for row in reader:
-            Freqs.append(len(row))
+        # Compute frequency of each delimiter on each input line
+        delimiters_freqs = {
+            d: [line.count(d) for line in sample]
+            for d in self.DELIMITERS
+        }
 
-        return Freqs
+        # Select delimiters with a standard deviation of zero, ie. delimiters
+        # for which we have the same number of fields on each line
+        selected_delimiters = [
+            (d, np.sum(freqs))
+            for d, freqs in delimiters_freqs.items()
+            if any(freqs) and np.std(freqs) == 0
+        ]
+
+        if selected_delimiters:
+            # Choose the delimiter with highest frequency amongst selected ones
+            sorted_delimiters = sorted(selected_delimiters, key=lambda x: x[1])
+            return sorted_delimiters[-1][0]
 
     def parse(self, filebuf):
+        print("CSV: parsing (assuming UTF-8 and LF line endings)")
         contents = filebuf.read().decode("UTF-8").split("\n")
-        sample_size = 10
-        sample_contents = contents[0:sample_size]
-        hyperdata_list = []
-        # # = = = = [ Getting delimiters frequency ] = = = = #
-        PossibleDelimiters = [ ',',' ','\t', ';', '|', ':' ]
-        AllDelimiters = {}
-        for delim in PossibleDelimiters:
-            AllDelimiters[delim] = self.CSVsample(sample_contents, delim)
-        # # = = = = [ / Getting delimiters frequency ] = = = = #
-        # # OUTPUT example:
-        # #  AllDelimiters = {
-        # #     '\t': [1, 1, 1, 1, 1],
-        # #      ' ': [1, 13, 261, 348, 330],
-        # #      ',': [15, 15, 15, 15, 15],
-        # #      ';': [1, 1, 1, 1, 1],
-        # #      '|': [1, 1, 1, 1, 1]
-        # #  }
-        # # = = = = [ Stand.Dev=0 & Sum of delimiters ] = = = = #
-        Delimiters = []
-        for d in AllDelimiters:
-            freqs = AllDelimiters[d]
-            suma = np.sum(freqs)
-            if suma > 0:
-                std = np.std(freqs)
-                # print [ d , suma , len(freqs) , std]
-                if std == 0:
-                    Delimiters.append([d, suma, len(freqs), std])
-        # # = = = = [ / Stand.Dev=0 & Sum of delimiters ] = = = = #
-        # # OUTPUT example:
-        # #  Delimiters = [
-        # #    ['\t', 5, 5, 0.0],
-        # #    [',', 75, 5, 0.0],
-        # #    ['|', 5, 5, 0.0]
-        # #  ]
-        # # = = = = [ Delimiter selection ] = = = = #
-        Sorted_Delims = sorted(Delimiters, key=lambda x: x[1], reverse=True)
-        HighestDelim = Sorted_Delims[0][0]
-        # HighestDelim = ","
-        print("CSV selected delimiter:", [HighestDelim])
-        # # = = = = [ / Delimiter selection ] = = = = #
-        # # = = = = [ First data coordinate ] = = = = #
-        Coords = {
-            "row": -1,
-            "column": -1
-        }
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum % 250 == 0:
-                print("CSV row: ", rownum)
-            joined_tokens = "".join(tokens)
-            if Coords["row"] < 0 and len(joined_tokens) > 0:
-                Coords["row"] = rownum
-                for columnum in range(len(tokens)):
-                    t = tokens[columnum]
-                    if len(t) > 0:
-                        Coords["column"] = columnum
-                        break
-        # # = = = = [ / First data coordinate ] = = = = #
-        # # = = = = [ Setting Headers ] = = = = #
-        Headers_Int2Str = {}
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum >= Coords["row"]:
-                for columnum in range(Coords["column"], len(tokens)):
-                    t = tokens[columnum]
-                    Headers_Int2Str[columnum] = t
-                break
-        # print("Headers_Int2Str")
-        # print(Headers_Int2Str)
-        # # = = = = [ / Setting Headers ] = = = = #
-        # # OUTPUT example:
-        # #  Headers_Int2Str = {
-        # #     0: 'publication_date',
-        # #     1: 'publication_month',
-        # #     2: 'publication_second',
-        # #     3: 'abstract'
-        # #  }
-        # # = = = = [ Reading the whole CSV and saving ] = = = = #
-        hyperdata_list = []
-        reader = csv.reader(contents, delimiter=HighestDelim)
-        for rownum, tokens in enumerate(reader):
-            if rownum > Coords["row"]:
-                RecordDict = {}
-                for columnum in range(Coords["column"], len(tokens)):
-                    data = tokens[columnum]
-                    RecordDict[Headers_Int2Str[columnum]] = data
-                if len(RecordDict.keys()) > 0:
-                    hyperdata_list.append(RecordDict)
-        # # = = = = [ / Reading the whole CSV and saving ] = = = = #
-        return hyperdata_list
+
+        # Filter out empty lines
+        contents = [line for line in contents if line.strip()]
+
+        # Delimiter auto-detection
+        delimiter = self.detect_delimiter(contents, sample_size=10)
+        if delimiter is None:
+            raise ValueError("CSV: couldn't detect delimiter, bug or malformed data")
+        print("CSV: selected delimiter: %r" % delimiter)
+
+        # Parse CSV
+        reader = csv.reader(contents, delimiter=delimiter)
+
+        # Get first not empty row and its fields (ie. header row), or (0, [])
+        first_row, headers = \
+            next(((i, fields) for i, fields in enumerate(reader) if any(fields)),
+                 (0, []))
+
+        # Get first not empty column of the first row, or 0
+        first_col = next((i for i, field in enumerate(headers) if field), 0)
+
+        # Strip out potential empty fields in headers
+        headers = headers[first_col:]
+
+        # Return a generator of dictionaries with column labels as keys,
+        # filtering out empty rows
+        for i, fields in enumerate(reader):
+            if i % 500 == 0:
+                print("CSV: parsing row #%s..." % (i+1))
+            if any(fields):
+                yield dict(zip(headers, fields[first_col:]))
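The rewritten parser concentrates delimiter guessing in detect_delimiter: a genuine delimiter should occur the same number of times on every sampled line (standard deviation zero), and among such candidates the most frequent one wins. A standalone sketch of that rule, mirroring the committed logic; the sample lines are made up:

import numpy as np

DELIMITERS = ", \t;|:"

def detect_delimiter(lines, sample_size=10):
    sample = lines[:sample_size]
    # per-delimiter counts on each sampled line
    freqs = {d: [line.count(d) for line in sample] for d in DELIMITERS}
    # keep delimiters present at least once and constant across lines
    candidates = [(d, np.sum(counts)) for d, counts in freqs.items()
                  if any(counts) and np.std(counts) == 0]
    if candidates:
        # highest total frequency wins
        return max(candidates, key=lambda x: x[1])[0]

lines = ["title;year;abstract", "Foo;2017;lorem ipsum", "Bar;2016;dolor sit"]
assert detect_delimiter(lines) == ";"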
gargantext/util/toolchain/ngrams_extraction.py
@@ -81,44 +81,45 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
             corpus.hyperdata["skipped_docs"].append(document.id)
             corpus.save_hyperdata()
             continue
-        else:
-            # ready !
-            tagger = tagger_bots[language_iso2]
-            # to do verify if document has no KEYS to index
-            # eg: use set intersect (+ loop becomes direct! with no continue)
-            for key in keys:
-                try:
-                    value = document.hyperdata[str(key)]
-                    if not isinstance(value, str):
-                        #print("DBG wrong content in doc for key", key)
-                        continue
-                    # get ngrams
-                    for ngram in tagger.extract(value):
-                        tokens = tuple(normalize_forms(token[0]) for token in ngram)
-                        if do_subngrams:
-                            # ex tokens = ["very", "cool", "exemple"]
-                            #    subterms = [['very', 'cool'],...]
-                            subterms = subsequences(tokens)
-                        else:
-                            subterms = [tokens]
-
-                        for seqterm in subterms:
-                            ngram = ' '.join(seqterm)
-                            nbwords = len(seqterm)
-                            nbchars = len(ngram)
-                            if nbchars > 1:
-                                if nbchars > 255:
-                                    # max ngram length (DB constraint)
-                                    ngram = ngram[:255]
-                                # doc <=> ngram index
-                                nodes_ngrams_count[(document.id, ngram)] += 1
-                                # add fields : terms n
-                                ngrams_data.add((ngram, nbwords, ))
-                except:
-                    #value not in doc
-                    continue
+
+        # ready !
+        tagger = tagger_bots[language_iso2]
+        # to do verify if document has no KEYS to index
+        # eg: use set intersect (+ loop becomes direct! with no continue)
+        for key in keys:
+            try:
+                value = document.hyperdata[str(key)]
+                if not isinstance(value, str):
+                    #print("DBG wrong content in doc for key", key)
+                    continue
+                # get ngrams
+                for ngram in tagger.extract(value):
+                    normal_forms = (normalize_forms(t[0]) for t in ngram)
+                    tokens = tuple(nf for nf in normal_forms if nf)
+                    if do_subngrams:
+                        # ex tokens = ["very", "cool", "exemple"]
+                        #    subterms = [['very', 'cool'],...]
+                        subterms = subsequences(tokens)
+                    else:
+                        subterms = [tokens]
+
+                    for seqterm in subterms:
+                        ngram = ' '.join(seqterm)
+                        nbwords = len(seqterm)
+                        nbchars = len(ngram)
+                        if nbchars > 1:
+                            if nbchars > 255:
+                                # max ngram length (DB constraint)
+                                ngram = ngram[:255]
+                            # doc <=> ngram index
+                            nodes_ngrams_count[(document.id, ngram)] += 1
+                            # add fields : terms n
+                            ngrams_data.add((ngram, nbwords, ))
+            except:
+                #value not in doc
+                continue
 
     # integrate ngrams and nodes-ngrams
     if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
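Beyond the dedent (the else: is redundant after continue), the key behavioural change in this hunk is that normalization output is now filtered, so an empty normalized form can no longer land inside a token tuple. A toy normalize_forms stands in for gargantext's real one (an assumption for illustration):

def normalize_forms(s):
    # placeholder for gargantext's normalization
    return s.strip().lower()

ngram = [("Big", "JJ"), ("  ", "SP"), ("Data", "NN")]   # (form, POS) pairs
normal_forms = (normalize_forms(t[0]) for t in ngram)
tokens = tuple(nf for nf in normal_forms if nf)
assert tokens == ("big", "data")   # the empty form is dropped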
templates/pages/projects/project.html
@@ -440,11 +440,12 @@
     // in the form "Add a corpus"
     var type = $("#id_type").val()
+    var file = $("#id_file").val()
 
     // 5 booleans
     var nameField = $("#id_name").val() != ""
     var typeField = (type != "") && (type != "0")
-    var fileField = $("#id_file").val() != ""
+    var fileField = file != ""
     var wantfileField = $("#file_yes").prop("checked")
     var crawling = ((type == 3) || (type == 8) || (type == 9)) && !wantfileField
@@ -457,6 +458,23 @@
     if (!crawling) {
         $("#submit_thing").prop('disabled', !(nameField && typeField && fileField))
     }
 
+    // Automatically select CSV when type is undefined
+    // and we have a .csv file
+    if (!typeField && file && file.match(/.csv$/i)) {
+        // Get CSV type id
+        var csv = $('#id_type > option')
+            .filter(function() { return $(this).text() === 'CSV' })
+            .attr('value')
+        // Select CSV type
+        $('#id_type').val(csv)
+        // Focus on name field
+        setTimeout(function() {
+            $("#id_name").focus()
+        })
+    }
 }
 
 function bringDaNoise() {