Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
082984a9
Commit
082984a9
authored
Oct 13, 2015
by
PkSM3
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'unstable' of
ssh://delanoe.org:1979/gargantext
into samuel
parents
6d73d2de
83d70c45
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
99 additions
and
43 deletions
+99
-43
cooccurrences.py
analysis/cooccurrences.py
+9
-5
functions.py
analysis/functions.py
+4
-2
celery.py
gargantext_web/celery.py
+18
-1
views.py
gargantext_web/views.py
+45
-23
cvalue.py
ngram/cvalue.py
+2
-2
group.py
ngram/group.py
+3
-2
tfidf.py
ngram/tfidf.py
+1
-2
workflow.py
ngram/workflow.py
+17
-6
No files found.
analysis/cooccurrences.py
View file @
082984a9
...
@@ -8,6 +8,8 @@ from gargantext_web.db import Node, Ngram, NodeNgram, NodeNgramNgram, \
...
@@ -8,6 +8,8 @@ from gargantext_web.db import Node, Ngram, NodeNgram, NodeNgramNgram, \
from
gargantext_web.db
import
session
,
cache
,
get_or_create_node
,
bulk_insert
from
gargantext_web.db
import
session
,
cache
,
get_or_create_node
,
bulk_insert
from
analysis.lists
import
WeightedMatrix
,
UnweightedList
,
Translations
from
analysis.lists
import
WeightedMatrix
,
UnweightedList
,
Translations
# keep list
def
cooc
(
corpus
=
None
def
cooc
(
corpus
=
None
,
field_X
=
None
,
field_Y
=
None
,
field_X
=
None
,
field_Y
=
None
,
miam_id
=
None
,
stop_id
=
None
,
group_id
=
None
,
miam_id
=
None
,
stop_id
=
None
,
group_id
=
None
...
@@ -51,10 +53,11 @@ def cooc(corpus=None
...
@@ -51,10 +53,11 @@ def cooc(corpus=None
NodeNgramX
=
aliased
(
NodeNgram
)
NodeNgramX
=
aliased
(
NodeNgram
)
NodeNgramY
=
aliased
(
NodeNgram
)
NodeNgramY
=
aliased
(
NodeNgram
)
cooc_score
=
func
.
sqrt
(
func
.
sum
(
NodeNgramX
.
weight
)
*
func
.
sum
(
NodeNgramY
.
weight
))
.
label
(
'cooc_score'
)
doc_id
=
cache
.
NodeType
[
'Document'
]
.
id
doc_id
=
cache
.
NodeType
[
'Document'
]
.
id
cooc_query
=
(
session
.
query
(
NodeNgramX
.
ngram_id
,
NodeNgramY
.
ngram_id
,
func
.
count
()
)
cooc_query
=
(
session
.
query
(
NodeNgramX
.
ngram_id
,
NodeNgramY
.
ngram_id
,
cooc_score
)
.
join
(
Node
,
Node
.
id
==
NodeNgramX
.
node_id
)
.
join
(
Node
,
Node
.
id
==
NodeNgramX
.
node_id
)
.
join
(
NodeNgramY
,
NodeNgramY
.
node_id
==
Node
.
id
)
.
join
(
NodeNgramY
,
NodeNgramY
.
node_id
==
Node
.
id
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
,
Node
.
type_id
==
doc_id
)
.
filter
(
Node
.
parent_id
==
corpus
.
id
,
Node
.
type_id
==
doc_id
)
...
@@ -104,13 +107,14 @@ def cooc(corpus=None
...
@@ -104,13 +107,14 @@ def cooc(corpus=None
# Cooc is symetric, take only the main cooccurrences and cut at the limit
# Cooc is symetric, take only the main cooccurrences and cut at the limit
cooc_query
=
(
cooc_query
.
filter
(
Node
.
parent_id
==
corpus
.
id
,
Node
.
type_id
==
doc_id
)
cooc_query
=
(
cooc_query
.
filter
(
NodeNgramX
.
ngram_id
<
NodeNgramY
.
ngram_id
)
.
filter
(
NodeNgramX
.
ngram_id
<
NodeNgramY
.
ngram_id
)
.
having
(
cooc_score
>
1
)
.
group_by
(
NodeNgramX
.
ngram_id
,
NodeNgramY
.
ngram_id
)
.
group_by
(
NodeNgramX
.
ngram_id
,
NodeNgramY
.
ngram_id
)
.
order_by
(
desc
(
func
.
count
()
))
.
order_by
(
desc
(
'cooc_score'
))
.
limit
(
limit
)
#.limit(50
)
)
)
matrix
=
WeightedMatrix
(
cooc_query
)
matrix
=
WeightedMatrix
(
cooc_query
)
...
...
analysis/functions.py
View file @
082984a9
...
@@ -229,10 +229,10 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
...
@@ -229,10 +229,10 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
#print(n)
#print(n)
#print(m)
#print(m)
nodes_included
=
3
00
#int(round(size/20,0))
nodes_included
=
2
00
#int(round(size/20,0))
#nodes_excluded = int(round(size/10,0))
#nodes_excluded = int(round(size/10,0))
nodes_specific
=
3
00
#int(round(size/10,0))
nodes_specific
=
2
00
#int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
#nodes_generic = int(round(size/10,0))
# TODO user the included score for the node size
# TODO user the included score for the node size
...
@@ -267,6 +267,8 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
...
@@ -267,6 +267,8 @@ def get_cooc(request=None, corpus=None, cooc_id=None, type='node_link', size=siz
G
.
remove_nodes_from
(
nodes_to_remove
)
G
.
remove_nodes_from
(
nodes_to_remove
)
uG
=
G
.
to_undirected
()
uG
=
G
.
to_undirected
()
partition
=
best_partition
(
uG
)
partition
=
best_partition
(
uG
)
print
(
"Density of the graph:"
,
nx
.
density
(
G
))
except
:
except
:
print
(
"-"
*
30
)
print
(
"-"
*
30
)
PrintException
()
PrintException
()
...
...
gargantext_web/celery.py
View file @
082984a9
...
@@ -2,13 +2,15 @@
...
@@ -2,13 +2,15 @@
from
celery
import
shared_task
from
celery
import
shared_task
from
node
import
models
from
node
import
models
from
django.db
import
transaction
import
cProfile
import
cProfile
#@app.task(bind=True)
#@app.task(bind=True)
@
shared_task
@
shared_task
def
debug_task
(
request
):
def
debug_task
(
request
):
print
(
'Request: {0!r}'
.
format
(
request
))
print
(
'Request: {0!r}'
.
format
(
request
))
from
gargantext_web.db
import
session
,
Node
from
gargantext_web.db
import
session
,
cache
,
Node
from
ngram.workflow
import
ngram_workflow
from
ngram.workflow
import
ngram_workflow
...
@@ -48,3 +50,18 @@ def apply_workflow(corpus_id):
...
@@ -48,3 +50,18 @@ def apply_workflow(corpus_id):
#ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
#ngrams2miam(user_id=corpus.user_id, corpus_id=corpus_id)
update_processing
(
corpus
,
0
)
update_processing
(
corpus
,
0
)
#@transaction.commit_manually
@
shared_task
def
empty_trash
(
corpus_id
):
nodes
=
models
.
Node
.
objects
.
filter
(
type_id
=
cache
.
NodeType
[
'Trash'
]
.
id
)
.
all
()
with
transaction
.
atomic
():
for
node
in
nodes
:
try
:
node
.
children
.
delete
()
except
Exception
as
error
:
print
(
error
)
node
.
delete
()
print
(
"Nodes deleted"
)
gargantext_web/views.py
View file @
082984a9
...
@@ -29,6 +29,7 @@ import json
...
@@ -29,6 +29,7 @@ import json
# SOME FUNCTIONS
# SOME FUNCTIONS
from
gargantext_web
import
settings
from
gargantext_web
import
settings
from
gargantext_web.settings
import
DEBUG
from
django.http
import
*
from
django.http
import
*
from
django.shortcuts
import
render_to_response
,
redirect
from
django.shortcuts
import
render_to_response
,
redirect
...
@@ -43,8 +44,9 @@ from gargantext_web.db import *
...
@@ -43,8 +44,9 @@ from gargantext_web.db import *
from
sqlalchemy
import
or_
,
func
from
sqlalchemy
import
or_
,
func
from
gargantext_web
import
about
from
gargantext_web
import
about
from
gargantext_web.celery
import
empty_trash
from
gargantext_web.db
import
NodeNgram
,
NodeNgramNgram
from
gargantext_web.db
import
cache
,
NodeNgram
,
NodeNgramNgram
def
login_user
(
request
):
def
login_user
(
request
):
logout
(
request
)
logout
(
request
)
...
@@ -416,17 +418,6 @@ def newpaginatorJSON(request , corpus_id):
...
@@ -416,17 +418,6 @@ def newpaginatorJSON(request , corpus_id):
return
JsonHttpResponse
(
finaldict
)
return
JsonHttpResponse
(
finaldict
)
def
empty_trash
():
nodes
=
models
.
Node
.
objects
.
filter
(
type_id
=
cache
.
NodeType
[
'Trash'
]
.
id
)
.
all
()
with
transaction
.
atomic
():
for
node
in
nodes
:
try
:
node
.
children
.
delete
()
except
Exception
as
error
:
print
(
error
)
node
.
delete
()
def
move_to_trash
(
node_id
):
def
move_to_trash
(
node_id
):
try
:
try
:
node
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
node_id
)
.
first
()
node
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
node_id
)
.
first
()
...
@@ -436,9 +427,16 @@ def move_to_trash(node_id):
...
@@ -436,9 +427,16 @@ def move_to_trash(node_id):
session
.
add
(
node
)
session
.
add
(
node
)
session
.
commit
()
session
.
commit
()
if
DEBUG
is
False
:
# TODO for the future maybe add id of node
empty_trash
.
apply_async
(
"corpus_id"
)
else
:
empty_trash
(
"corpus_id"
)
return
(
previous_type_id
)
return
(
previous_type_id
)
except
Exception
as
error
:
except
Exception
as
error
:
print
(
"can not move to trash Node"
+
node_id
+
":"
+
error
)
print
(
"can not move to trash Node"
+
str
(
node_id
)
+
":"
+
str
(
error
)
)
def
move_to_trash_multiple
(
request
):
def
move_to_trash_multiple
(
request
):
user
=
request
.
user
user
=
request
.
user
...
@@ -521,6 +519,24 @@ def chart(request, project_id, corpus_id):
...
@@ -521,6 +519,24 @@ def chart(request, project_id, corpus_id):
}))
}))
return
HttpResponse
(
html
)
return
HttpResponse
(
html
)
def
sankey
(
request
,
corpus_id
):
t
=
get_template
(
'sankey.html'
)
user
=
request
.
user
date
=
datetime
.
datetime
.
now
()
corpus
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
corpus_id
)
.
first
()
html
=
t
.
render
(
Context
({
\
'debug'
:
settings
.
DEBUG
,
'user'
:
user
,
\
'date'
:
date
,
\
'corpus'
:
corpus
,
\
}))
return
HttpResponse
(
html
)
def
matrix
(
request
,
project_id
,
corpus_id
):
def
matrix
(
request
,
project_id
,
corpus_id
):
t
=
get_template
(
'matrix.html'
)
t
=
get_template
(
'matrix.html'
)
user
=
request
.
user
user
=
request
.
user
...
@@ -539,7 +555,7 @@ def matrix(request, project_id, corpus_id):
...
@@ -539,7 +555,7 @@ def matrix(request, project_id, corpus_id):
return
HttpResponse
(
html
)
return
HttpResponse
(
html
)
def
graph
(
request
,
project_id
,
corpus_id
):
def
graph
(
request
,
project_id
,
corpus_id
,
generic
=
100
,
specific
=
100
):
t
=
get_template
(
'explorer.html'
)
t
=
get_template
(
'explorer.html'
)
user
=
request
.
user
user
=
request
.
user
date
=
datetime
.
datetime
.
now
()
date
=
datetime
.
datetime
.
now
()
...
@@ -569,6 +585,8 @@ def graph(request, project_id, corpus_id):
...
@@ -569,6 +585,8 @@ def graph(request, project_id, corpus_id):
# import pprint
# import pprint
# pprint.pprint(results)
# pprint.pprint(results)
# if specific != None and generic != None :
graphurl
=
"corpus/"
+
str
(
corpus_id
)
+
"/node_link.json"
graphurl
=
"corpus/"
+
str
(
corpus_id
)
+
"/node_link.json"
html
=
t
.
render
(
Context
({
\
html
=
t
.
render
(
Context
({
\
'debug'
:
settings
.
DEBUG
,
'debug'
:
settings
.
DEBUG
,
...
@@ -684,7 +702,7 @@ def send_csv(request, corpus_id):
...
@@ -684,7 +702,7 @@ def send_csv(request, corpus_id):
return
response
return
response
# To get the data
# To get the data
from
rest_v1_0.api
import
JsonHttpResponse
from
rest_v1_0.api
import
JsonHttpResponse
,
CsvHttpResponse
from
analysis.functions
import
get_cooc
from
analysis.functions
import
get_cooc
def
node_link
(
request
,
corpus_id
):
def
node_link
(
request
,
corpus_id
):
'''
'''
...
@@ -692,18 +710,22 @@ def node_link(request, corpus_id):
...
@@ -692,18 +710,22 @@ def node_link(request, corpus_id):
'''
'''
data
=
[]
data
=
[]
corpus
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
corpus_id
)
.
first
()
corpus
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
corpus_id
)
.
first
()
# filename = settings.MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (request.user , corpus.parent_id, corpus_id)
# print("file exists?:",os.path.isfile(filename))
# if os.path.isfile(filename):
# json_data = open(filename,"r")
# data = json.load(json_data)
# json_data.close()
# else:
data
=
get_cooc
(
request
=
request
,
corpus
=
corpus
,
type
=
"node_link"
)
data
=
get_cooc
(
request
=
request
,
corpus
=
corpus
,
type
=
"node_link"
)
return
JsonHttpResponse
(
data
)
return
JsonHttpResponse
(
data
)
def
sankey_csv
(
request
,
corpus_id
):
data
=
[]
corpus
=
session
.
query
(
Node
)
.
filter
(
Node
.
id
==
corpus_id
)
.
first
()
data
=
[
[
"source"
,
"target"
,
"value"
]
,
[
"Elvis_1"
,
"Elvis_2"
,
1
]
,
[
"Elvis_2"
,
"Elvis_3"
,
2
]
,
[
"Barry"
,
"Elvis_3"
,
2
]
]
return
(
CsvHttpResponse
(
data
))
def
adjacency
(
request
,
corpus_id
):
def
adjacency
(
request
,
corpus_id
):
'''
'''
Create the HttpResponse object with the adjacency dataset.
Create the HttpResponse object with the adjacency dataset.
...
...
ngram/cvalue.py
View file @
082984a9
...
@@ -36,7 +36,7 @@ ngrams = {'adenoic cystic basal cell carcinoma' : 5
...
@@ -36,7 +36,7 @@ ngrams = {'adenoic cystic basal cell carcinoma' : 5
}
}
'''
'''
def
getNgrams
(
corpus
=
None
,
limit
=
1
6
0
):
def
getNgrams
(
corpus
=
None
,
limit
=
1
00
0
):
'''
'''
getNgrams :: Corpus -> [(Int, String, String, Float)]
getNgrams :: Corpus -> [(Int, String, String, Float)]
'''
'''
...
@@ -63,7 +63,7 @@ def getNgrams(corpus=None, limit=160):
...
@@ -63,7 +63,7 @@ def getNgrams(corpus=None, limit=160):
PrintException
()
PrintException
()
return
(
terms
)
return
(
terms
)
def
compute_cvalue
(
corpus
=
None
,
limit
=
1
6
0
):
def
compute_cvalue
(
corpus
=
None
,
limit
=
1
00
0
):
'''
'''
computeCvalue :: Corpus
computeCvalue :: Corpus
frequency :: String -> Int -> Int
frequency :: String -> Int -> Int
...
...
ngram/group.py
View file @
082984a9
...
@@ -137,9 +137,10 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
...
@@ -137,9 +137,10 @@ def compute_groups(corpus, limit_inf=None, limit_sup=None, how='Stem'):
miam_to_insert
.
add
((
miam_node
.
id
,
n
[
0
],
1
))
miam_to_insert
.
add
((
miam_node
.
id
,
n
[
0
],
1
))
#print([n for n in group])
#print([n for n in group])
for
g
in
group
:
for
g
in
group
:
if
(
miam_node
.
id
,
g
[
0
],
1
)
not
in
miam_to_insert
:
#list_to_check.remove(g)
#list_to_check.remove(g)
group_to_insert
.
append
((
node_group
.
id
,
n
[
0
],
g
[
0
],
1
))
group_to_insert
.
append
((
node_group
.
id
,
n
[
0
],
g
[
0
],
1
))
print
(
n
[
1
],
"="
,
g
[
1
])
print
(
n
[
1
],
"="
,
g
[
1
])
# Deleting previous groups
# Deleting previous groups
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
node_group
.
id
)
.
delete
()
session
.
query
(
NodeNgramNgram
)
.
filter
(
NodeNgramNgram
.
node_id
==
node_group
.
id
)
.
delete
()
...
...
ngram/tfidf.py
View file @
082984a9
...
@@ -127,7 +127,6 @@ def compute_tfidf_global(corpus):
...
@@ -127,7 +127,6 @@ def compute_tfidf_global(corpus):
tfidf_node
=
get_or_create_node
(
nodetype
=
'Tfidf (global)'
,
corpus
=
corpus
)
tfidf_node
=
get_or_create_node
(
nodetype
=
'Tfidf (global)'
,
corpus
=
corpus
)
# compute terms frequency sum
# compute terms frequency sum
db
,
cursor
=
get_cursor
()
db
,
cursor
=
get_cursor
()
...
@@ -240,7 +239,7 @@ def compute_tfidf_global(corpus):
...
@@ -240,7 +239,7 @@ def compute_tfidf_global(corpus):
lnD
=
log
(
D
)
lnD
=
log
(
D
)
cursor
.
execute
(
'UPDATE tmp__idf SET idf = idf +
%
f'
%
(
lnD
,
))
cursor
.
execute
(
'UPDATE tmp__idf SET idf = idf +
%
f'
%
(
lnD
,
))
# show off
# show off
dbg
.
show
(
'insert tfidf for
%
d documents'
%
D
)
dbg
.
show
(
'insert tfidf for
%
d documents'
%
(
D
,
)
)
cursor
.
execute
(
'''
cursor
.
execute
(
'''
INSERT INTO
INSERT INTO
%
s (nodex_id, nodey_id, ngram_id, score)
%
s (nodex_id, nodey_id, ngram_id, score)
...
...
ngram/workflow.py
View file @
082984a9
...
@@ -7,17 +7,28 @@ from ngram.group import compute_groups
...
@@ -7,17 +7,28 @@ from ngram.group import compute_groups
from
ngram.miam
import
compute_miam
from
ngram.miam
import
compute_miam
from
gargantext_web.db
import
get_or_create_node
from
gargantext_web.db
import
get_or_create_node
def
ngram_workflow
(
corpus
):
def
ngram_workflow
(
corpus
,
n
=
5000
):
'''
'''
All the workflow to filter the ngrams.
All the workflow to filter the ngrams.
'''
'''
compute_tfidf
(
corpus
)
compute_tfidf_global
(
corpus
)
compute_tfidf_global
(
corpus
)
compute_cvalue
(
corpus
,
limit
=
10000
)
# size
compute_specificity
(
corpus
,
limit
=
10000
)
part
=
round
(
n
*
0.8
)
compute_cvalue
(
corpus
,
limit
=
part
)
# size
part
=
round
(
part
*
0.6
)
compute_specificity
(
corpus
,
limit
=
part
)
part
=
round
(
part
*
0.5
)
# compute_stop(corpus)
# compute_stop(corpus)
compute_groups
(
corpus
,
limit_inf
=
1000
,
limit_sup
=
5000
)
compute_groups
(
corpus
,
limit_inf
=
part
,
limit_sup
=
n
)
compute_miam
(
corpus
,
limit
=
3000
)
# size
# compute_miam(corpus,limit=part) # size
compute_tfidf
(
corpus
)
#corpus=session.query(Node).filter(Node.id==244250).first()
#corpus=session.query(Node).filter(Node.id==244250).first()
#ngram_workflow(corpus)
#ngram_workflow(corpus)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment