Commit f0cc9050 authored Oct 22, 2015 by delanoe
[FEAT] adding option to tokenize monograms without nlp, + stop tools
parent dcfe453b
Showing 8 changed files with 188 additions and 43 deletions (+188 / -43)
gargantext_web/celery.py      +1   -1
gargantext_web/views.py       +11  -3
init.py                       +15  -0
ngram/group.py                +5   -3
ngram/miam.py                 +40  -0
ngram/stop.py                 +94  -21
ngram/tools.py                +0   -3
parsing/corpustools.py        +22  -12
gargantext_web/celery.py
@@ -42,7 +42,7 @@ def apply_workflow(corpus_id):
     parse_resources(corpus)
     update_processing(corpus, 2)
-    extract_ngrams(corpus, ['title', 'abstract'])
+    extract_ngrams(corpus, ['title', 'abstract'], nlp=True)
     update_processing(corpus, 3)
     ngram_workflow(corpus)
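The workflow keeps NLP-based extraction as the default (nlp=True). A caller that only needs plain monogram tokenization would presumably pass nlp=False instead; a hypothetical variant of this workflow step:

    # Hypothetical: same workflow step with NLP disabled; extract_ngrams() then
    # falls back to wordpunct_tokenize() (see parsing/corpustools.py below) and
    # records every token as a monogram (n = 1).
    extract_ngrams(corpus, ['title', 'abstract'], nlp=False)
    update_processing(corpus, 3)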
gargantext_web/views.py
@@ -699,9 +699,17 @@ def sankey_csv(request, corpus_id):
     corpus = session.query(Node).filter(Node.id == corpus_id).first()
     data = [
         ["source", "target", "value"]
+        , ["Elvis_1", "Elvis_2", 1]
+        , ["Comment_1", "Theme_1", 1]
+        , ["Elvis_2", "Elvis_3", 2]
+        , ["Barry", "Elvis_3", 2]
+        , ["Comment_2", "Theme_2", 2]
+        , ["Comment_3", "Theme_2", 2]
+        , ["Comment_7", "Theme_1", 2]
+        , ["Comment_8", "Theme_3", 2]
+        , ["Theme_1", "Reco_par_1", 2]
+        , ["Theme_2", "Reco_par_2", 2]
+        , ["Theme_2", "Reco_par_5", 2]
+        , ["Theme_3", "Reco_par_5", 1]
         ]
     return(CsvHttpResponse(data))
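CsvHttpResponse presumably serializes the nested list row by row, so the endpoint would return CSV along these lines (illustrative output, first rows only):

    source,target,value
    Elvis_1,Elvis_2,1
    Comment_1,Theme_1,1
    Elvis_2,Elvis_3,2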
init.py
@@ -151,4 +151,19 @@ session.commit()
 ###f.close()
 ##
 ##
+from ngram.stop import importStopList
+
+root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
+importStopList(root, '/srv/gargantext/init/stop_lists/fr.txt', 'fr')
+importStopList(root, '/srv/gargantext/init/stop_lists/en.txt', 'en')
+
+root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
+#importStopList(root, '/srv/gargantext/init/stop_lists/fr.txt', 'fr')
+importStopList(root, '/srv/gargantext/init/stop_lists/en.txt', 'en')
 #exit()
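importStopList reads the file with f.read().splitlines(), so a stop-list file is expected to hold one term per line; an illustrative excerpt of what /srv/gargantext/init/stop_lists/fr.txt might contain:

    le
    la
    les
    un
    une
    de
    et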
ngram/group.py
@@ -60,15 +60,17 @@ def getNgrams(corpus=None, limit_inf=600, limit_sup=3000):
     #print([n for n in tfidf_ngrams])
-    def list2set(_list, _set):
+    def list2set(_list):
+        _set = set()
         for n in _list:
             _set.add((n[0], n[1]))
+        return(_set)
 
     cvalue_set = set()
     spec_set = set()
-    list2set(cvalue_ngrams, cvalue_set)
-    list2set(spec_ngrams, spec_set)
+    cvalue_set = list2set(cvalue_ngrams)
+    spec_set = list2set(spec_ngrams)
 
     cvalue_setDiff = cvalue_set.difference(spec_set)
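list2set is now a pure helper that builds and returns the set instead of mutating a set passed in by the caller. A minimal standalone sketch of the new behavior on made-up (id, terms) rows:

    # Standalone sketch of the refactored helper; the input rows are illustrative.
    def list2set(_list):
        _set = set()
        for n in _list:
            _set.add((n[0], n[1]))
        return(_set)

    cvalue_set = list2set([(1, 'cell'), (2, 'stem cell')])
    # -> {(1, 'cell'), (2, 'stem cell')}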
ngram/miam.py
@@ -12,6 +12,8 @@ from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 from sqlalchemy import literal_column
 from sqlalchemy.orm import aliased
+from ngram.tools import insert_ngrams
+import csv
 
 def compute_miam(corpus, limit=500):
     '''
@@ -50,5 +52,43 @@ def compute_miam(corpus,limit=500):
     dbg.show('Miam computed')
 
+def insert_miam(corpus, ngrams=None, path_file_csv=None):
+    dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
+    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
+    session.query(NodeNgram).filter(NodeNgram.node_id == node_miam.id).delete()
+    session.commit()
+
+    stop_words = set()
+    miam_words = set()
+
+    if path_file_csv is not None:
+        file_csv = open(path_file_csv, "r")
+        reader = csv.reader(file_csv, delimiter=',')
+        for line in reader:
+            word = line[0]
+            tag = line[4]
+            if tag == '1':
+                miam_words.add((word, 1))
+            elif tag == '0':
+                stop_words.add((word, 1))
+
+    miam_ids = insert_ngrams(miam_words)
+    print(miam_ids)
+
+    limit = len(list(miam_words))
+    data = zip(
+        [node_miam.id for i in range(1, limit)]
+        , [miam_ids[n] for n in miam_ids.keys()]
+        , [1 for i in range(1, limit)]
+        )
+    #print([d for d in data])
+    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+
+    file_csv.close()
+    dbg.show('Miam computed')
+
+#corpus = session.query(Node).filter(Node.id==556113).first()
+#insert_miam(corpus=corpus, path_file_csv="Thesaurus_tag.csv")
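insert_miam takes the term from column 0 and the tag from column 4 of each CSV row: a tag of '1' keeps the term in the MiamList, '0' marks it as a stop word, and the intermediate columns are ignored. A hypothetical Thesaurus_tag.csv fragment (the middle column values are placeholders):

    stem cell,x,x,x,1
    result,x,x,x,0
    microfluidics,x,x,x,1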
ngram/stop.py
@@ -2,49 +2,122 @@
 #from admin.env import *
 #from ngram.stemLem import *
+import re
 from admin.utils import PrintException
 from gargantext_web.db import NodeNgram, NodeNodeNgram
-from gargantext_web.db import get_or_create_node, session
+from gargantext_web.db import cache, session, get_or_create_node
 from sqlalchemy.sql import func
 from sqlalchemy import desc, asc, or_, and_, Date, cast, select
 from sqlalchemy import literal_column
 from sqlalchemy.orm import aliased
+from ngram.tools import insert_ngrams
+from analysis.lists import WeightedList, UnweightedList
+
+def importStopList(node, filename, language='fr'):
+    with open(filename, "r") as f:
+        stop_list = f.read().splitlines()
+    stop_words = set(stop_list)
+    stop_ids = insert_ngrams([(word, len(word.split(' '))) for word in stop_words])
+
+    stop_node = get_or_create_node(nodetype='StopList', corpus=node)
+    stop_node.language_id = cache.Language[language].id
+    session.add(stop_node)
+    session.commit()
+
+    size = len(list(stop_words))
+    data = zip(
+        [stop_node.id for i in range(0, size)]
+        , [stop_ids[word] for word in list(stop_words)]
+        , [-1 for i in range(0, size)]
+        )
+    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+
+def isStopWord(ngram, stop_words=None):
+    '''
+    ngram :: (Int, String) => (ngram_id, ngram_terms)
+    stop_words :: Set of String
+    (to avoid SQL query each time isStopWord is invoked, get in as parameter)
+    '''
+    word = ngram[1]
+    if word in stop_words:
+        return(True)
+
+    def test_match(word, regex):
+        format_regex = re.compile(regex)
+        if format_regex.match(word):
+            return(True)
+
+    for regex in ["(.*)\d(.*)"
+                 , "^.{1,2}$"
+                 , "(.*)(\.)(.*)"
+                 , "(.*)(\,)(.*)"
+                 , "(.*)(study)(.*)"
+                 , "(.*)(result)(.*)"
+                 , "(.*)(année)(.*)"
+                 , "(.*)(temps)(.*)"
+                 , "(.*)(%)(.*)"
+                 , "(.*)(\{)(.*)"
+                 , "(.*)(terme)(.*)"
+                 , "(.*)(différent)(.*)"
+                 , "(.*)(travers)(.*)"
+                 , "(.*)(:|\|)(.*)"
+                 ]:
+        if test_match(word, regex) is True:
+            return(True)
+
-def computeStop(corpus, size=100):
+def compute_stop(corpus, size=2000, debug=False):
     '''
     do some statitics on all stop lists of database of the same type
     '''
-    node_stop = get_or_create_node(nodetype='StopList', corpus=corpus)
-    Stop = aliased(NodeNgram)
-    top_spec = (session.query(NodeNodeNgram.ngram_id, NodeNodeNgram.score)
-                .outerjoin(Stop, Stop.ngram_id == NodeNodeNgram.ngram_id)
-                .filter(NodeNodeNgram.nodex_id == node_spec.id)
-                .filter(Stop.node_id == node_stop.id)
-                .order_by(desc(NodeNodeNgram.score))
-                .limit(size)
-                )
-    node_miam = get_or_create_node(nodetype='MiamList', corpus=corpus)
-    session.query(NodeNgram).filter(NodeNgram.node_id == node_miam.id).delete()
-    data = zip(
-        [node_miam.id for i in range(1, size)]
-        , [1 for i in range(1, size)]
-        , [n[0] for n in top_spec]
-        )
-    #print([d for d in data])
-    bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
+    stop_node = get_or_create_node(nodetype='StopList', corpus=corpus)
+    miam_node = get_or_create_node(nodetype='MiamList', corpus=corpus)
+
+    # TODO do a function to get all stop words with social scores
+    root = session.query(Node).filter(Node.type_id == cache.NodeType['Root'].id).first()
+    root_stop_id = get_or_create_node(nodetype='StopList', corpus=root).id
+
+    stop_words = (session.query(Ngram.terms)
+                  .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+                  .filter(NodeNgram.node_id == root_stop_id)
+                  .all()
+                  )
+
+    top_words = (session.query(Ngram.id, Ngram.terms)
+                 .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
+                 .filter(NodeNgram.node_id == miam_node.id)
+                 .order_by(desc(NodeNgram.weight))
+                 .limit(size)
+                 )
+
+    ngrams_to_stop = filter(lambda x: isStopWord(x, stop_words=stop_words), top_words)
+
+    stop = WeightedList({n[0]: -1 for n in ngrams_to_stop})
+    stop.save(stop_node.id)
+
+    miam = UnweightedList(miam_node.id)
+    new_miam = miam - stop
+    new_miam.save(miam_node.id)
+
+    # data = zip(
+    #     [stop_node.id for i in range(0,size)]
+    #     , [ngram[0] for ngram in ngrams_to_stop]
+    #     , [-1 for i in range(0,size)]
+    #     )
+    # bulk_insert(NodeNgram, ['node_id', 'ngram_id', 'weight'], [d for d in data])
 
-#corpus=session.query(Node).filter(Node.id==244250).first()
-#computeMiam(corpus)
+#corpus=session.query(Node).filter(Node.id==545461).first()
+#compute_stop(corpus)
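isStopWord combines the stored stop list with pattern-based filters (digits, one- or two-character strings, punctuation, percentages, and a few French/English noise words). A small illustration with a made-up stop_words set (note the function returns True or None, never False):

    stop_words = {'the', 'of'}
    isStopWord((1, 'the'), stop_words=stop_words)        # True: listed stop word
    isStopWord((2, '2015'), stop_words=stop_words)       # True: matches "(.*)\d(.*)"
    isStopWord((3, 'ab'), stop_words=stop_words)         # True: matches "^.{1,2}$"
    isStopWord((4, 'stem cell'), stop_words=stop_words)  # None: the term is kept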
ngram/tools.py
@@ -28,8 +28,6 @@ def insert_ngrams(ngrams,get='terms-id'):
         %s AS ngram
     WHERE
         tmp__ngram.terms = ngram.terms
-        AND
-        tmp__ngram.n = ngram.n
     ''' % (Ngram.__table__.name,))
     cursor.execute('''
@@ -67,7 +65,6 @@ def insert_ngrams(ngrams,get='terms-id'):
     db.commit()
     return(ngram_ids)
-
 def insert_nodengramngram(nodengramngram):
     db, cursor = get_cursor()
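With the extra predicate removed, insert_ngrams matches existing rows on terms alone. Elsewhere in this commit it is called with (terms, n) tuples and returns a dict mapping each terms string to its ngram id, as in ngram/stop.py:

    # Usage as in importStopList(); the returned ids are illustrative.
    stop_words = {'temps', 'stem cell'}
    stop_ids = insert_ngrams([(word, len(word.split(' '))) for word in stop_words])
    # stop_ids == {'temps': 101, 'stem cell': 102}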
parsing/corpustools.py
@@ -177,6 +177,8 @@ def parse_resources(corpus, user=None, user_id=None):
 # ngrams extraction
 from .NgramsExtractors import EnglishNgramsExtractor, FrenchNgramsExtractor, NgramsExtractor
+from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
+
 class NgramsExtractors(defaultdict):
     def __init__(self):
         # English
@@ -201,7 +203,7 @@ class NgramsExtractors(defaultdict):
 ngramsextractors = NgramsExtractors()
 
-def extract_ngrams(corpus, keys):
+def extract_ngrams(corpus, keys, nlp=True):
     dbg = DebugTime('Corpus #%d - ngrams' % corpus.id)
     default_language_iso2 = None if corpus.language_id is None else cache.Language[corpus.language_id].iso2
     # query the hyperdata associated with the given keys
@@ -220,7 +222,7 @@ def extract_ngrams(corpus, keys):
     ngrams_data = set()
     ngrams_language_data = set()
-    ngrams_tag_data = set()
+    # ngrams_tag_data = set()
     node_ngram_list = defaultdict(lambda: defaultdict(int))
     for nodeinfo in hyperdata_query:
@@ -237,17 +239,25 @@ def extract_ngrams(corpus, keys):
            ngramsextractor = ngramsextractors[language_iso2]
            for text in nodeinfo[2:]:
                if text is not None and len(text):
-                   ngrams = ngramsextractor.extract_ngrams(text.replace("[", "").replace("]", ""))
+                   if nlp == True:
+                       ngrams = ngramsextractor.extract_ngrams(text.replace("[", "").replace("]", ""))
+                   else:
+                       ngrams = wordpunct_tokenize(text.lower())
                    for ngram in ngrams:
-                       n = len(ngram)
-                       terms = ' '.join([token for token, tag in ngram]).lower()
+                       if nlp == True:
+                           n = len(ngram)
+                           terms = ' '.join([token for token, tag in ngram]).lower()
+                       else:
+                           terms = ngram
+                           n = 1
                        # TODO BUG here
-                       if n == 1:
-                           #tag_id = cache.Tag[ngram[0][1]].id
-                           tag_id = 1
-                           #print('tag_id', tag_id)
-                       elif n > 1:
-                           tag_id = 1
-                           #tag_id = cache.Tag[ngram[0][1]].id
-                           #tag_id = cache.Tag['NN'].id
-                           #tag_id = 14
+                       # if n == 1:
+                       #     #tag_id = cache.Tag[ngram[0][1]].id
+                       #     tag_id = 1
+                       #     #print('tag_id', tag_id)
+                       # elif n > 1:
+                       #     tag_id = 1
+                       #     #tag_id = cache.Tag[ngram[0][1]].id
+                       #     #tag_id = cache.Tag['NN'].id
+                       #     #tag_id = 14
@@ -255,7 +265,7 @@ def extract_ngrams(corpus, keys):
                        node_ngram_list[node_id][terms] += 1
                        ngrams_data.add((terms[:255], n))
                        ngrams_language_data.add((terms, language_id))
-                       ngrams_tag_data.add((terms, tag_id))
+                       # ngrams_tag_data.add((terms, tag_id))
     # insert ngrams to temporary table
     dbg.show('find ids for the %d ngrams' % len(ngrams_data))
@@ -263,12 +273,12 @@ def extract_ngrams(corpus, keys):
     ngram_ids = insert_ngrams(ngrams_data)
     dbg.show('insert associations')
-    node_ngram_data = list()
+    node_ngram_data = set()
     for node_id, ngrams in node_ngram_list.items():
        for terms, weight in ngrams.items():
            try:
                ngram_id = ngram_ids[terms]
-               node_ngram_data.append((node_id, ngram_id, weight,))
+               node_ngram_data.add((node_id, ngram_id, weight,))
            except Exception as e:
                print("err01:", e)
    bulk_insert(Node_Ngram, ['node_id', 'ngram_id', 'weight'], node_ngram_data, cursor=cursor)
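With nlp=False the POS-tagging extractors are bypassed: the text is lower-cased and split with NLTK's wordpunct_tokenize, and every token is stored as a monogram (n = 1). A standalone illustration of that branch (the sample text and node id are made up):

    from collections import defaultdict
    from nltk.tokenize import wordpunct_tokenize

    text = "Stem-cell research: new results in 2015."
    node_ngram_list = defaultdict(lambda: defaultdict(int))

    for token in wordpunct_tokenize(text.lower()):
        node_ngram_list[42][token] += 1     # 42 stands in for a node_id

    # tokens: 'stem', '-', 'cell', 'research', ':', 'new', 'results', 'in', '2015', '.'

Punctuation and number tokens produced this way are presumably what the new digit and punctuation regexes in ngram/stop.py are meant to filter out afterwards.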