Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
66318619
Commit
66318619
authored
Jun 02, 2015
by
PkSM3
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[UPDATE] ngramstable separation
parent
86c289b4
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
263 additions
and
170 deletions
+263
-170
urls.py
gargantext_web/urls.py
+4
-2
views.py
gargantext_web/views.py
+0
-168
InterUnion.py
tests/ngramstable/InterUnion.py
+37
-0
__init__.py
tests/ngramstable/__init__.py
+0
-0
models.py
tests/ngramstable/models.py
+3
-0
views.py
tests/ngramstable/views.py
+219
-0
No files found.
gargantext_web/urls.py
View file @
66318619
...
...
@@ -8,6 +8,8 @@ from gargantext_web import views, views_optimized
import
gargantext_web.api
import
scrappers.scrap_pubmed.views
as
pubmedscrapper
import
tests.ngramstable.views
as
samtest
admin
.
autodiscover
()
...
...
@@ -78,8 +80,8 @@ urlpatterns = patterns('',
url
(
r'^tests/project/(\d+)/ISTEXquery/go$'
,
pubmedscrapper
.
testISTEX
),
url
(
r'^tests/paginator/corpus/(\d+)/$'
,
views
.
newpaginatorJSON
),
url
(
r'^tests/move2trash/$'
,
views
.
move_to_trash_multiple
),
url
(
r'^project/(\d+)/corpus/(\d+)/ngrams
$'
,
views
.
get_ngrams
),
url
(
r'^project/(\d+)/corpus/(\d+)/ngrams/ngrams.json$'
,
views
.
test_ngrams
)
url
(
r'^project/(\d+)/corpus/(\d+)/ngrams
/ngrams.json$'
,
samtest
.
test_ngrams
)
# url(r'^project/(\d+)/corpus/(\d+)/ngrams$', views.get_ngrams),
)
...
...
gargantext_web/views.py
View file @
66318619
...
...
@@ -255,174 +255,6 @@ def projects(request):
'projects'
:
projects
})
def get_ngrams(request, project_id, corpus_id):
    """Render the ``tests/ngrams.html`` page for one corpus of a project.

    Anonymous users are redirected to the login page with ``next`` set to
    the requested path.

    Args:
        request: incoming HTTP request; ``request.user`` and ``request.path``
            are read.
        project_id: project node id captured from the URL.
        corpus_id: corpus node id captured from the URL.

    Returns:
        A redirect to ``/login/`` for anonymous users, otherwise an
        ``HttpResponse`` holding the rendered template.

    Raises:
        Http404: if either id cannot be converted to an integer.
    """
    if not request.user.is_authenticated():
        return redirect('/login/?next=%s' % request.path)

    # Parse both ids once and keep the integers; they are reused for the
    # cache lookups and for the SQL filter below.
    try:
        project_id = int(project_id)
        corpus_id = int(corpus_id)
    except ValueError:
        raise Http404()

    template = get_template('tests/ngrams.html')

    user = cache.User[request.user.username].id
    date = datetime.datetime.now()
    project = cache.Node[project_id]
    corpus = cache.Node[corpus_id]
    type_doc_id = cache.NodeType['Document'].id

    # Number of nodes of type 'Document' whose parent is this corpus.
    number = (
        session.query(func.count(Node.id))
        .filter(Node.parent_id == corpus_id, Node.type_id == type_doc_id)
        .scalar()
    )

    # Best effort: any failure reading the 'Processing' entry is printed and
    # the template then receives 0 instead.
    try:
        processing = corpus.hyperdata['Processing']
    except Exception as error:
        print(error)
        processing = 0

    html = template.render(Context({
        'debug': settings.DEBUG,
        'user': user,
        'date': date,
        'project': project,
        'corpus': corpus,
        'processing': processing,
        'number': number,
    }))

    return HttpResponse(html)
def test_ngrams(request, project_id, corpus_id):
    """Return per-ngram occurrence and TF-IDF metrics for a corpus as JSON.

    Only 'Document' nodes that belong to the requesting user and whose parent
    is ``corpus_id`` are taken into account. For every ngram attached to one
    of those documents the response carries:

    * ``occ_sum``   -- sum of the ``Node_Ngram.weight`` values,
    * ``occ_uniq``  -- number of ``Node_Ngram`` rows seen for the ngram,
    * ``tfidf_sum`` -- sum of the ``NodeNodeNgram.score`` values stored with
      ``nodex_id == corpus_id``.

    Args:
        request: incoming HTTP request; only ``request.user.id`` is read.
        project_id: captured from the URL; not read by this view.
        corpus_id: id of the corpus node whose documents are inspected.

    Returns:
        ``JsonHttpResponse`` wrapping
        ``{"ngrams": [{"scores": {...}, "name": ..., "id": ...}, ...],
        "scores": {"nb_docs": ..., "nb_ngrams": ...}}``.
    """
    user_id = request.user.id
    document_type_id = cache.NodeType['Document'].id

    documents = (
        session.query(Node)
        .filter(
            Node.user_id == user_id,
            Node.parent_id == corpus_id,
            Node.type_id == document_type_id,
        )
        .all()
    )

    # ngram_id -> {"scores": {...}}; "name" and "id" are filled in further down.
    # NOTE: the original author also tried to read the unique-occurrence count
    # from the corpus WhiteList node and left a remark that this did not work
    # for every ngram, so the rows are counted directly here.
    ngram_scores = {}
    for doc in documents:
        occurrences = (
            session.query(Node_Ngram)
            .filter(Node_Ngram.node_id == doc.id)
            .all()
        )
        for occurrence in occurrences:
            if occurrence.ngram_id not in ngram_scores:
                ngram_scores[occurrence.ngram_id] = {
                    "scores": {"occ_sum": 0.0, "occ_uniq": 0.0, "tfidf_sum": 0.0},
                }
            scores = ngram_scores[occurrence.ngram_id]["scores"]
            scores["occ_sum"] += occurrence.weight
            scores["occ_uniq"] += 1

    # Per the original author's notes: in NodeNodeNgram, nodex_id is the
    # corpus id and nodey_id is the document id.
    tfidf_rows = (
        session.query(NodeNodeNgram)
        .filter(NodeNodeNgram.nodex_id == corpus_id)
        .all()
    )
    for row in tfidf_rows:
        entry = ngram_scores.get(row.ngram_id)
        if entry is None:
            # This query is not restricted to the user's documents, so a row
            # may reference an ngram that was never collected above; skip it
            # instead of failing with KeyError.
            continue
        entry["scores"]["tfidf_sum"] += row.score

    metrics = {
        "ngrams": [],
        "scores": {
            "nb_docs": len(documents),
            "nb_ngrams": len(ngram_scores),
        },
    }

    # Nothing to look up when no ngram was collected; avoids an empty IN ().
    if ngram_scores:
        ngrams_data = (
            session.query(Ngram)
            .filter(Ngram.id.in_(list(ngram_scores)))
            .all()
        )
        for ngram in ngrams_data:
            entry = ngram_scores[ngram.id]
            entry["name"] = ngram.terms
            entry["id"] = ngram.id
            metrics["ngrams"].append(entry)

    return JsonHttpResponse(metrics)
def
corpus
(
request
,
project_id
,
corpus_id
):
if
not
request
.
user
.
is_authenticated
():
return
redirect
(
'/login/?next=
%
s'
%
request
.
path
)
...
...
tests/ngramstable/InterUnion.py
0 → 100644
View file @
66318619
import
networkx
as
nx
from
itertools
import
combinations
class Utils:
    """List set-operations plus an incrementally built weighted graph.

    Each instance owns one undirected ``networkx`` graph, ``self.G``, that
    ``addCompleteSubGraph`` grows call after call.
    """

    def __init__(self):
        self.G = nx.Graph()

    def unique(self, a):
        """Return the elements of ``a`` with duplicates removed.

        Elements must be hashable. First-occurrence order is kept wherever
        ``dict`` preserves insertion order, which makes the result
        deterministic.
        """
        return list(dict.fromkeys(a))

    def intersect(self, a, b):
        """Return the distinct elements of ``a`` that also occur in ``b``."""
        in_b = set(b)
        return [item for item in dict.fromkeys(a) if item in in_b]

    def union(self, a, b):
        """Return the distinct elements found in ``a`` or in ``b``."""
        return list(dict.fromkeys(list(a) + list(b)))

    def addCompleteSubGraph(self, terms):
        """Link every pair of distinct ``terms`` in ``self.G``.

        Each term becomes a node. Every unordered pair of distinct terms
        gets an edge: a new edge starts with ``weight`` 1.0 and an existing
        edge has its ``weight`` increased by 1.0.

        Args:
            terms: iterable of hashable node labels. Repeated labels are
                counted once per call, so a single call never produces a
                self-loop and never bumps the same edge twice.
        """
        graph = self.G

        # Materialise once: ``terms`` is needed for both the nodes and the
        # pairs, and a one-shot iterator would otherwise be exhausted after
        # the node pass.
        nodes = list(dict.fromkeys(terms))

        for node in nodes:
            graph.add_node(node)

        for n1, n2 in combinations(nodes, 2):
            if graph.has_edge(n1, n2):
                graph[n1][n2]['weight'] += 1.0
            else:
                graph.add_edge(n1, n2, weight=1.0)
\ No newline at end of file
tests/ngramstable/__init__.py
0 → 100644
View file @
66318619
tests/ngramstable/models.py
0 → 100644
View file @
66318619
from
django.db
import
models
# Create your models here.
tests/ngramstable/views.py
0 → 100644
View file @
66318619
from
django.shortcuts
import
redirect
from
django.shortcuts
import
render
from
django.db
import
transaction
from
django.http
import
Http404
,
HttpResponse
,
HttpResponseRedirect
,
HttpResponseForbidden
from
django.template.loader
import
get_template
from
django.template
import
Context
from
node
import
models
#from node.models import Language, ResourceType, Resource, \
# Node, NodeType, Node_Resource, Project, Corpus, \
# Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram
from
node.admin
import
CorpusForm
,
ProjectForm
,
ResourceForm
,
CustomForm
from
django.contrib.auth.models
import
User
import
datetime
from
itertools
import
*
from
dateutil.parser
import
parse
from
django.db
import
connection
from
django
import
forms
from
collections
import
defaultdict
from
parsing.FileParsers
import
*
import
os
import
json
# SOME FUNCTIONS
from
gargantext_web
import
settings
from
django.http
import
*
from
django.shortcuts
import
render_to_response
,
redirect
from
django.template
import
RequestContext
from
django.contrib.auth.decorators
import
login_required
from
django.contrib.auth
import
authenticate
,
login
,
logout
from
scrappers.scrap_pubmed.admin
import
Logger
from
gargantext_web.db
import
*
from
sqlalchemy
import
or_
,
func
from
gargantext_web
import
about
from
gargantext_web.api
import
JsonHttpResponse
def get_ngrams(request, project_id, corpus_id):
    """Render the ``tests/ngrams.html`` page for one corpus of a project.

    Anonymous users are redirected to the login page with ``next`` set to
    the requested path.

    Args:
        request: incoming HTTP request; ``request.user`` and ``request.path``
            are read.
        project_id: project node id captured from the URL.
        corpus_id: corpus node id captured from the URL.

    Returns:
        A redirect to ``/login/`` for anonymous users, otherwise an
        ``HttpResponse`` holding the rendered template.

    Raises:
        Http404: if either id cannot be converted to an integer.
    """
    if not request.user.is_authenticated():
        return redirect('/login/?next=%s' % request.path)

    # Parse both ids once and keep the integers; they are reused for the
    # cache lookups and for the SQL filter below.
    try:
        project_id = int(project_id)
        corpus_id = int(corpus_id)
    except ValueError:
        raise Http404()

    template = get_template('tests/ngrams.html')

    user = cache.User[request.user.username].id
    date = datetime.datetime.now()
    project = cache.Node[project_id]
    corpus = cache.Node[corpus_id]
    type_doc_id = cache.NodeType['Document'].id

    # Number of nodes of type 'Document' whose parent is this corpus.
    number = (
        session.query(func.count(Node.id))
        .filter(Node.parent_id == corpus_id, Node.type_id == type_doc_id)
        .scalar()
    )

    # Best effort: any failure reading the 'Processing' entry is printed and
    # the template then receives 0 instead.
    try:
        processing = corpus.hyperdata['Processing']
    except Exception as error:
        print(error)
        processing = 0

    html = template.render(Context({
        'debug': settings.DEBUG,
        'user': user,
        'date': date,
        'project': project,
        'corpus': corpus,
        'processing': processing,
        'number': number,
    }))

    return HttpResponse(html)
def test_ngrams(request, project_id, corpus_id):
    """Return per-ngram occurrence and TF-IDF metrics for a corpus as JSON.

    Only 'Document' nodes that belong to the requesting user and whose parent
    is ``corpus_id`` are taken into account. For every ngram attached to one
    of those documents the response carries:

    * ``occ_sum``   -- sum of the ``Node_Ngram.weight`` values,
    * ``occ_uniq``  -- number of ``Node_Ngram`` rows seen for the ngram,
    * ``tfidf_sum`` -- sum of the ``NodeNodeNgram.score`` values stored with
      ``nodex_id == corpus_id``.

    Args:
        request: incoming HTTP request; only ``request.user.id`` is read.
        project_id: captured from the URL; not read by this view.
        corpus_id: id of the corpus node whose documents are inspected.

    Returns:
        ``JsonHttpResponse`` wrapping
        ``{"ngrams": [{"scores": {...}, "name": ..., "id": ...}, ...],
        "scores": {"nb_docs": ..., "nb_ngrams": ...}}``.
    """
    user_id = request.user.id
    document_type_id = cache.NodeType['Document'].id

    documents = (
        session.query(Node)
        .filter(
            Node.user_id == user_id,
            Node.parent_id == corpus_id,
            Node.type_id == document_type_id,
        )
        .all()
    )

    # ngram_id -> {"scores": {...}}; "name" and "id" are filled in further down.
    # NOTE: the original author also tried to read the unique-occurrence count
    # from the corpus WhiteList node and left a remark that this did not work
    # for every ngram, so the rows are counted directly here.
    ngram_scores = {}
    for doc in documents:
        occurrences = (
            session.query(Node_Ngram)
            .filter(Node_Ngram.node_id == doc.id)
            .all()
        )
        for occurrence in occurrences:
            if occurrence.ngram_id not in ngram_scores:
                ngram_scores[occurrence.ngram_id] = {
                    "scores": {"occ_sum": 0.0, "occ_uniq": 0.0, "tfidf_sum": 0.0},
                }
            scores = ngram_scores[occurrence.ngram_id]["scores"]
            scores["occ_sum"] += occurrence.weight
            scores["occ_uniq"] += 1

    # Per the original author's notes: in NodeNodeNgram, nodex_id is the
    # corpus id and nodey_id is the document id.
    tfidf_rows = (
        session.query(NodeNodeNgram)
        .filter(NodeNodeNgram.nodex_id == corpus_id)
        .all()
    )
    for row in tfidf_rows:
        entry = ngram_scores.get(row.ngram_id)
        if entry is None:
            # This query is not restricted to the user's documents, so a row
            # may reference an ngram that was never collected above; skip it
            # instead of failing with KeyError.
            continue
        entry["scores"]["tfidf_sum"] += row.score

    metrics = {
        "ngrams": [],
        "scores": {
            "nb_docs": len(documents),
            "nb_ngrams": len(ngram_scores),
        },
    }

    # Nothing to look up when no ngram was collected; avoids an empty IN ().
    if ngram_scores:
        ngrams_data = (
            session.query(Ngram)
            .filter(Ngram.id.in_(list(ngram_scores)))
            .all()
        )
        for ngram in ngrams_data:
            entry = ngram_scores[ngram.id]
            entry["name"] = ngram.terms
            entry["id"] = ngram.id
            metrics["ngrams"].append(entry)

    return JsonHttpResponse(metrics)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment