humanities / gargantext · Commits

Commit 697e39ee, authored Nov 23, 2015 by delanoe

    Merge branch 'samuel' into unstable-merge

Parents: 2b0c0d2e, 18b05727

Showing 6 changed files with 218 additions and 105 deletions (+218 −105).
Changed files:

    parsing/NgramsExtractors/EnglishNgramsExtractor.py    +1    −1
    parsing/NgramsExtractors/NgramsExtractor.py            +2    −2
    rest_v1_0/ngrams.py                                    +63   −43
    static/js/NGrams_dyna_chart_and_table.js               +104  −58
    templates/corpus/terms.html                            +1    −1
    tests/ngramstable/chunked_selects.py                   +47   −0
parsing/NgramsExtractors/EnglishNgramsExtractor.py  (view file @ 697e39ee)

...
@@ -7,4 +7,4 @@ class EnglishNgramsExtractor(NgramsExtractor):

    def start(self):
        # self.tagger = NltkTagger()
        self.tagger = MeltTagger(language='en')
\ No newline at end of file
parsing/NgramsExtractors/NgramsExtractor.py  (view file @ 697e39ee)

from ..Taggers import TurboTagger
from ..Taggers import NltkTagger
import nltk
...
@@ -21,7 +21,7 @@ class NgramsExtractor:
        self.stop()

    def start(self):
        self.tagger = TurboTagger()
        self.tagger = NltkTagger()

    def stop(self):
        pass
...
rest_v1_0/ngrams.py  (view file @ 697e39ee)

...
@@ -23,6 +23,7 @@ from gargantext_web.db import session, Node, NodeNgram, NodeNgramNgram\

def DebugHttpResponse(data):
    return HttpResponse('<html><body style="background:#000;color:#FFF"><pre>%s</pre></body></html>' % (str(data), ))

import time
import json

class JSONEncoder(json.JSONEncoder):
    def default(self, obj):
...
@@ -78,20 +79,65 @@ from rest_framework.decorators import api_view

class List(APIView):

    def get_metadata(self, ngram_ids, parent_id):
        start_ = time.time()
        nodes_ngrams = session.query(Ngram.id, Ngram.terms).filter(Ngram.id.in_(list(ngram_ids.keys()))).all()
        for node in nodes_ngrams:
            if node.id in ngram_ids:
                ngram_ids[node.id] = {
                    "id": node.id,
                    "name": node.terms,
                    "scores": {"tfidf": 0}
                }
        tfidf_list = get_or_create_node(nodetype='Tfidf (global)', corpus_id=parent_id).id
        ngram_tfidf = session.query(NodeNodeNgram.ngram_id, NodeNodeNgram.score).filter(
            NodeNodeNgram.nodex_id == tfidf_list,
            NodeNodeNgram.ngram_id.in_(list(ngram_ids.keys()))
        ).all()
        for n in ngram_tfidf:
            if n.ngram_id in ngram_ids:
                ngram_ids[n.ngram_id]["scores"]["tfidf"] += n.score
        end_ = time.time()
        return {"data": ngram_ids, "secs": (end_ - start_)}

    def get(self, request, corpus_id, list_name):
        start_ = time.time()
        corpus = session.query(Node).filter(Node.id == corpus_id).first()
        list_name = list_name.title() + "List"
        node_list = get_or_create_node(nodetype=list_name, corpus=corpus)
        nodes_ngrams = session.query(NodeNgram).filter(NodeNgram.node_id == node_list.id).all()
        nodes_ngrams = session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == node_list.id).all()
        ngram_ids = {}
        for node in nodes_ngrams:
            ngram_ids[node.ngram_id] = True
        ngrams = [int(i) for i in list(ngram_ids.keys())]
        # ngram_ids = get_occtfidf( ngrams , request.user.id , corpus_id , list_name)
        return JsonHttpResponse({"data": ngram_ids})
        end_ = time.time()
        measurements = {
            "get_ngram_ids": {
                "s": (end_ - start_),
                "n": len(ngram_ids.keys())
            }
        }
        if request.GET.get('custom', False) != False:
            ngrams_meta = self.get_metadata(ngram_ids, corpus_id)
            ngram_ids = ngrams_meta["data"]
            measurements["tfidf"] = {
                "s": ngrams_meta["secs"],
                "n": len(ngrams_meta["data"].keys())
            }
        return JsonHttpResponse({"data": ngram_ids, "time": measurements})
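
For quick manual testing, the List endpoint above can be called directly over HTTP. The sketch below is illustrative only: it assumes a locally running instance, uses a placeholder corpus id, relies on the /api/node/<id>/ngrams/list/<name> route that the JavaScript further down targets, and assumes the variant of the view that returns the timing measurements.

    import requests

    corpus_id = 84592        # placeholder id (same example id as in the JS init below)
    url = "http://localhost:8000/api/node/%d/ngrams/list/map" % corpus_id

    # Plain call: "data" maps each ngram_id of the list to a boolean.
    plain = requests.get(url).json()

    # With ?custom the view calls get_metadata(), so each entry becomes
    # {"id": ..., "name": ..., "scores": {"tfidf": ...}} and a "tfidf"
    # timing entry is added to the measurements.
    custom = requests.get(url + "?custom").json()

    print(plain.get("time"))                 # may be absent depending on the view variant
    for ngram_id, meta in custom["data"].items():
        print(ngram_id, meta)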
class Ngrams(APIView):
    '''
...
@@ -100,40 +146,24 @@ class Ngrams(APIView):
    http://localhost:8000/api/node/1444485/ngrams?format=json&score=tfidf,occs
    '''

    def get(self, request, node_id):
        # print("\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        # query ngrams
        start_ = time.time()
        ParentNode = aliased(Node)
        corpus = session.query(Node).filter(Node.id == node_id).first()
        group_by = []
        results = ['id', 'terms']
        # print("\t\tSTEP 01","\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        ngrams_query = (session.query(Ngram.id, Ngram.terms)
                        .join(Node_Ngram, Node_Ngram.ngram_id == Ngram.id)
                        .join(Node, Node.id == Node_Ngram.node_id)
                        )
        # print("\t\tSTEP 02","\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        the_score = "tfidf"
        if request.GET.get('score', False) != False:
            the_score = request.GET['score']

        # # get the scores
        # print( I can't get the ngrams occs with the aliased and get_or_create_node )
        # if 'occs' in the_score:
        #     print("OOOOOOOCCCSSSS:")
        #     miamlist = session.query(Node).filter(Node.user_id == request.user.id , Node.parent_id==node_id , Node.type_id == cache.NodeType['MiamList'].id ).first()
        #     print( miamlist )
        #     Miam = aliased(NodeNgram)
        #     ngrams_query = ( session.query(NodeNgram.ngram_id, func.sum(NodeNgram.weight))
        #         .join(Node, Node.id == NodeNgram.node_id)
        #         .join(Miam, Miam.ngram_id == NodeNgram.ngram_id)
        #         .filter(Node.parent_id == node_id, Node.type_id==cache.NodeType['Document'].id)
        #         .filter(Miam.node_id==miamlist.id)
        #         .group_by(NodeNgram.ngram_id)
        #         .all()
        #     )
        #     for i in ngrams_query:
        #         print(i)

        if 'occs' in the_score:
            occs = func.sum(Node_Ngram.weight).label('occs')
            ngrams_query = (ngrams_query.add_column(occs))
...
@@ -169,7 +199,6 @@ class Ngrams(APIView):
            group_by.append(Spec.score)
            results.append('specificity')
        # print("\t\tSTEP 03","\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        order_query = request.GET.get('order', False)
        if order_query == 'occs':
            ngrams_query = ngrams_query.order_by(desc(occs))
...
@@ -180,7 +209,6 @@ class Ngrams(APIView):
        elif order_query == 'specificity':
            ngrams_query = ngrams_query.order_by(desc(Spec.score))
        # print("\t\tSTEP 04","\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        offset = int(request.GET.get('offset', 0))
        limit = int(request.GET.get('limit', 20))
...
@@ -188,7 +216,6 @@ class Ngrams(APIView):
                        .group_by(Ngram.id, Ngram.terms, *group_by)
                        )
        # print("\t\tSTEP 05","\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        if request.GET.get('ngram_id', False) != False:
            ngram_id = int(request.GET['ngram_id'])
            Group = aliased(NodeNgramNgram)
...
@@ -198,8 +225,6 @@ class Ngrams(APIView):
                            .filter(Group.ngramx_id == ngram_id)
                            )
        # print("\t\tSTEP 06","\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        # # filters by list type (soon list_id to factorize it in javascript)
        list_query = request.GET.get('list', 'miam')
        list_id = request.GET.get('list_id', False)
        if list_query == 'miam':
...
@@ -215,7 +240,6 @@ class Ngrams(APIView):
                            .filter(Stop.node_id == stop_id)
                            )
        elif list_query == 'map':
            # ngram could be in ngramx_id or ngramy_id
            CoocX = aliased(NodeNgramNgram)
            CoocY = aliased(NodeNgramNgram)
            cooc_id = get_or_create_node(nodetype='Cooccurrence', corpus=corpus).id
...
@@ -241,15 +265,6 @@ class Ngrams(APIView):
                            .filter(CoocY.node_id == node.id)
                            )
        # print("\t\tSTEP 07","\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        # print("")
        # print(ngrams_query)
        # total = ngrams_query.count()
        # print("")
        # print("\t\tSTEP 07.1, count:", total ,"\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        # print("\t\tSTEP 07.2, i:", offset , ", N:", (offset+limit) ,"\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        output = []
        for ngram in ngrams_query[offset:offset + limit]:
            info = {"scores": {}}
...
@@ -268,17 +283,22 @@ class Ngrams(APIView):
            output.append(info)
        # print("\t\tSTEP 08","\tCORPUS:",node_id," LIST:",request.GET.get('list', False) , " SCORES:",request.GET.get('score', False))
        end_ = time.time()
        measurements = {
            "s": (end_ - start_),
            "n": len(output)
        }
        # return formatted result
        return JsonHttpResponse({
            'pagination': {
                'offset': offset,
                'limit': limit,
                'total': len(output),
            },
            'data': output,
            "time": measurements
        })

    def post(self, request, node_id):
        return JsonHttpResponse(["POST", "ok"])
...
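
The Ngrams view above is what the terms table queries for scored rows. A minimal client-side sketch is shown below; the host and node id are placeholders taken from the docstring, and the exact keys of each row in "data" depend on hunks that are collapsed in this diff.

    import requests

    node_id = 1444485            # example id from the docstring above
    url = "http://localhost:8000/api/node/%d/ngrams" % node_id

    params = {
        "format": "json",
        "score": "tfidf,occs",   # scores handled by the view
        "list": "miam",          # 'miam', 'stop' or 'map'
        "order": "occs",         # ordering branch shown above
        "offset": 0,
        "limit": 20,
    }

    payload = requests.get(url, params=params).json()

    print(payload["pagination"])     # {'offset': ..., 'limit': ..., 'total': ...}
    print(payload["time"])           # {'s': elapsed seconds, 'n': number of rows}
    for row in payload["data"]:      # one dict per ngram, with a "scores" sub-dict
        print(row)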
static/js/NGrams_dyna_chart_and_table.js  (view file @ 697e39ee)

...
@@ -897,10 +897,10 @@ function GET_( url , callback ) {

// [ = = = = = = = = = = INIT = = = = = = = = = = ]
// http://localhost:8000/api/node/84592/ngrams?format=json&score=tfidf,occs&list=miam
var corpus_id = getIDFromURL( "corpus" )

var url0 = window.location.origin+"/api/node/"+corpus_id+"/ngrams?format=json&score=tfidf,occs&list=stop&limit=1000",
    url1 = window.location.origin+"/api/node/"+corpus_id+"/ngrams/group",
    url2 = window.location.origin+"/api/node/"+corpus_id+"/ngrams/list/map",
    url3 = window.location.origin+"/api/node/"+corpus_id+"/ngrams?format=json&score=tfidf,occs&list=miam&limit=1000";

// var url0=window.location.origin+"/api/node/"+corpus_id+"/ngrams?format=json&score=tfidf,occs&list=stop&limit=1000",
//     url1=window.location.origin+"/api/node/"+corpus_id+"/ngrams/group",
//     url2=window.location.origin+"/api/node/"+corpus_id+"/ngrams/list/map",
//     url3=window.location.origin+"/api/node/"+corpus_id+"/ngrams?format=json&score=tfidf,occs&list=miam&limit=1000";

var NGrams = {
    "group" : {},
    "stop" : {},
...
@@ -911,71 +911,120 @@ var NGrams = {

$("#corpusdisplayer").hide()

// // The AJAX's in cascade:
// GET_( url0 , function(result) {
//     if(result!=false) {
//         for(var i in result) {
//             NGrams["stop"][result[i].id] = result[i]
//         }
//     }
//     GET_( url1 , function(result) {
//         if(result!=false) {
//             NGrams["group"] = result
//         }
//         GET_( url2 , function(result) {
//             if(result!=false) {
//                 NGrams["map"] = result
//             }
//             GET_( url3 , function(result) {
//                 if(result!=false) {
//                     NGrams["main"] = {
//                         "ngrams": result,
//                         "scores": {
//                             "initial":"tfidf",
//                             "nb_docs":result.length,
//                             "orig_nb_ngrams":1,
//                             "nb_ngrams":result.length,
//                         }
//                     }
//                     AfterAjax()
//                 }
//             });
//         });
//     });
// });

var url = [
    window.location.origin+"/api/node/"+corpus_id+"/ngrams/list/map?custom",
    window.location.origin+"/api/node/"+corpus_id+"/ngrams/group",
    window.location.origin+"/api/node/"+corpus_id+"/ngrams?format=json&score=tfidf,occs&list=stop&limit=1000",
]

// The AJAX's in cascade:
GET_( url0 , function(result) {
GET_( url[0] , function(result) {
    if(result!=false) {
        NGrams["main"] = {
            "ngrams": [],
            "scores": {
                "initial":"tfidf",
                "nb_docs":result.length,
                "orig_nb_ngrams":1,
                "nb_ngrams":result.length,
            }
        }
        var counter = 0
        for(var i in result) {
            NGrams["stop"][result[i].id] = result[i]
            NGrams["map"][result[i].id] = true
            NGrams["main"].ngrams.push(result[i])
            NGrams["main"].ngrams[counter]["state"] = System[0]["statesD"]["keep"]
            counter++;
        }
        console.log(NGrams["main"])
        AfterAjax()
    }
    GET_( url1 , function(result) {
    GET_( url[1] , function(result) {
        if(result!=false) {
            NGrams["group"] = result
        }
        GET_( url2 , function(result) {
            if(result!=false) {
                NGrams["map"] = result
            }
            GET_( url3 , function(result) {
                if(result!=false) {
                    NGrams["main"] = {
                        "ngrams": result,
                        "scores": {
                            "initial":"occ_uniq",
                            "nb_docs":result.length,
                            "orig_nb_ngrams":1,
                            "nb_ngrams":result.length,
                        }
                    }
                    AfterAjax()
                }
            });
        GET_( url[2] , function(result) {
            for(var i in result) {
                NGrams["stop"][result[i].id] = result[i]
            }
        });
    });
});

function AfterAjax() {
    // Deleting subforms from the ngrams-table, clean start baby!
    if( Object.keys(NGrams["group"].links).length>0 ) {
        var _forms = { "main":{} , "sub":{} }
        for(var i in NGrams["group"].links) {
            _forms["main"][i] = true
            for(var j in NGrams["group"].links[i]) {
                _forms["sub"][ NGrams["group"].links[i][j] ] = true
            }
        }
        var ngrams_data_ = []
        for(var i in NGrams["main"].ngrams) {
            if(_forms["sub"][NGrams["main"].ngrams[i].id]) {
                NGrams["group"]["nodes"][NGrams["main"].ngrams[i].id] = NGrams["main"].ngrams[i]
            } else {
                // if( _forms["main"][ NGrams["main"].ngrams[i].id ] )
                //     NGrams["main"].ngrams[i].name = "*"+NGrams["main"].ngrams[i].name
                ngrams_data_.push( NGrams["main"].ngrams[i] )
            }
        }
        NGrams["main"].ngrams = ngrams_data_;
    }
    if( Object.keys(NGrams["map"]).length>0 ) {
        for(var i in NGrams["main"].ngrams) {
            if(NGrams["map"][NGrams["main"].ngrams[i].id]) {
                NGrams["main"].ngrams[i]["state"] = System[0]["statesD"]["keep"]
            }
        }
    }
function AfterAjax() {
    // // Deleting subforms from the ngrams-table, clean start baby!
    // if( Object.keys(NGrams["group"].links).length>0 ) {
    //     var _forms = { "main":{} , "sub":{} }
    //     for(var i in NGrams["group"].links) {
    //         _forms["main"][i] = true
    //         for(var j in NGrams["group"].links[i]) {
    //             _forms["sub"][ NGrams["group"].links[i][j] ] = true
    //         }
    //     }
    //     var ngrams_data_ = []
    //     for(var i in NGrams["main"].ngrams) {
    //         if(_forms["sub"][NGrams["main"].ngrams[i].id]) {
    //             NGrams["group"]["nodes"][NGrams["main"].ngrams[i].id] = NGrams["main"].ngrams[i]
    //         } else {
    //             // if( _forms["main"][ NGrams["main"].ngrams[i].id ] )
    //             //     NGrams["main"].ngrams[i].name = "*"+NGrams["main"].ngrams[i].name
    //             ngrams_data_.push( NGrams["main"].ngrams[i] )
    //         }
    //     }
    //     NGrams["main"].ngrams = ngrams_data_;
    // }
    // if( Object.keys(NGrams["map"]).length>0 ) {
    //     for(var i in NGrams["main"].ngrams) {
    //         if(NGrams["map"][NGrams["main"].ngrams[i].id]) {
    //             NGrams["main"].ngrams[i]["state"] = System[0]["statesD"]["keep"]
    //         }
    //     }
    // }

    // Building the Score-Selector //NGrams["scores"]
    var FirstScore = NGrams["main"].scores.initial
...
@@ -1006,6 +1055,3 @@ function AfterAjax() {
    $("#content_loader").remove()

    $("#corpusdisplayer").click()
}
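
The rewritten init replaces the four-step url0…url3 cascade with a three-entry url[] array (custom map list, groups, stop list). For debugging outside the browser, the same three requests can be reproduced in Python; the origin and corpus_id below are placeholders.

    import requests

    origin = "http://localhost:8000"     # placeholder, stands in for window.location.origin
    corpus_id = 84592                    # placeholder corpus id

    urls = [
        origin + "/api/node/%d/ngrams/list/map?custom" % corpus_id,
        origin + "/api/node/%d/ngrams/group" % corpus_id,
        origin + "/api/node/%d/ngrams?format=json&score=tfidf,occs&list=stop&limit=1000" % corpus_id,
    ]

    # Same order as the url[] array consumed by GET_ above.
    map_custom, groups, stop_list = (requests.get(u).json() for u in urls)

    for name, payload in zip(("map?custom", "group", "stop"), (map_custom, groups, stop_list)):
        shape = list(payload.keys()) if isinstance(payload, dict) else "list of %d items" % len(payload)
        print(name, "->", shape)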
templates/corpus/terms.html  (view file @ 697e39ee)

...
@@ -233,7 +233,7 @@ input[type=radio]:checked + label {

    <div id="filter_search" style="visibility:hidden">
      <select id="example-single-optgroups" onchange="SearchFilters(this);">
        <!-- <optgroup label=""> -->
        <option id="filter_all" value="filter_all">All</option>
        <!-- <option id="filter_all" value="filter_all">All</option> -->
        <!-- <option id="filter_title" value="filter_title">Title</option> -->
        <!-- <option id="filter_date" value="filter_date">Date</option> -->
        <!-- </optgroup> -->
...
tests/ngramstable/chunked_selects.py  (new file, 0 → 100644, view file @ 697e39ee)

import threading
from queue import Queue
# import time
import random
from gargantext_web.db import session, Node_Ngram


class ChunkedSELECTS:

    def __init__(self):
        self.q = Queue()
        self.firstResults = []
        self.lock = threading.Lock()  # lock to serialize console output
        self.ngrams_dict = {}

    def worker_sql_action(self, docs_list):
        data = {}
        for d in docs_list:
            # this_ngrams = session.query(Node_Ngram.ngram_id).filter( Node_Ngram.node_id==d).all()
            this_ngrams = session.query(Node_Ngram.ngram_id, Node_Ngram.weight).filter(Node_Ngram.node_id == d).all()
            filtered_ngrams = []
            for n in this_ngrams:
                if n[0] in self.ngrams_dict:
                    # filtered_ngrams.append( n[0] )
                    filtered_ngrams.append([n[0], int(n[1])])
            data[d] = filtered_ngrams
        with self.lock:
            # print(threading.current_thread().name, str(len(docs_list))+" OK")
            return data

    def worker_sql(self):
        while True:
            item = self.q.get()
            results = []
            try:
                result = self.worker_sql_action(item)
            except:
                result = False
            self.firstResults.append(result)
            self.q.task_done()

    def chunks(self, l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]
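
A driver for ChunkedSELECTS is not part of the committed file; the sketch below shows one plausible way to use it, with docs (a list of document node ids) and allowed_ngrams (the ngram ids to keep) as placeholder inputs.

    import threading

    sql = ChunkedSELECTS()
    sql.ngrams_dict = allowed_ngrams          # placeholder: ngram ids to keep per document

    for _ in range(4):                        # arbitrary number of worker threads
        t = threading.Thread(target=sql.worker_sql)
        t.daemon = True                       # workers loop forever; let the process exit anyway
        t.start()

    for chunk in sql.chunks(docs, 100):       # placeholder chunk size
        sql.q.put(chunk)

    sql.q.join()                              # block until every queued chunk is processed

    # Each worker appends one {doc_id: [[ngram_id, weight], ...]} dict per chunk
    # (or False if its query raised), so merge the partial results afterwards.
    merged = {}
    for partial in sql.firstResults:
        if partial:
            merged.update(partial)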