humanities/gargantext, commit 2d1a9b89
Authored Mar 02, 2015 by PkSM3
[UPDATE] newuser-nirvana: workflow__MOV + graph generation (writing ngrams in DB not included)
Parent: b7edf98a

Showing 5 changed files with 52 additions and 79 deletions (+52 / -79):

    analysis/functions.py    +0   -21
    gargantext_web/views.py  +17  -11
    node/models.py           +24  -37
    scrap_pubmed/views.py    +7   -3
    templates/project.html   +4   -7
analysis/functions.py
@@ -245,27 +245,6 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
     return data
-
-#def tfidf(corpus, document, ngram):
-#    '''
-#    Compute TF-IDF (Term Frequency - Inverse Document Frequency)
-#    See: http://en.wikipedia.org/wiki/Tf%E2%80%93idf
-#    '''
-#    try:
-#        occurences_of_ngram = Node_Ngram.objects.get(node=document, ngram=ngram).weight
-#        ngrams_by_document = sum([x.weight for x in Node_Ngram.objects.filter(node=document)])
-#        term_frequency = occurences_of_ngram / ngrams_by_document
-#
-#        xx = Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")).count()
-#        yy = Node_Ngram.objects.filter(ngram=ngram).count()  # filter: ON node.parent=corpus
-#        inverse_document_frequency = log(xx/yy)
-#
-#        # result = tf * idf
-#        result = term_frequency * inverse_document_frequency
-#    except Exception as error:
-#        print(error, ngram)
-#        result = 0
-#    return result
 from analysis.tfidf import tfidf

 def do_tfidf(corpus, reset=True):
...
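The comment block deleted above documents the formula that the imported analysis.tfidf module is now expected to provide: term frequency is the ngram's weight in the document divided by the document's total ngram weight, and inverse document frequency is log(documents in the corpus / documents containing the ngram). A minimal self-contained sketch of that formula over plain dictionaries (illustrative names, not the project's ORM-backed implementation):

    from math import log

    def tfidf_sketch(corpus_docs, doc_id, ngram):
        """Toy TF-IDF over {doc_id: {ngram: weight}} dicts, mirroring the removed code."""
        weights = corpus_docs[doc_id]
        term_frequency = weights.get(ngram, 0) / float(sum(weights.values()))
        docs_with_ngram = sum(1 for w in corpus_docs.values() if ngram in w)
        if docs_with_ngram == 0:
            return 0  # the removed code likewise fell back to 0 on failure
        return term_frequency * log(len(corpus_docs) / float(docs_with_ngram))

    # Example: "graph" carries 2 of the 3 ngram weights in doc 1 and occurs in 1 of 2 docs.
    docs = {1: {"graph": 2, "corpus": 1}, 2: {"corpus": 3}}
    print(tfidf_sketch(docs, 1, "graph"))  # 2/3 * log(2) ~ 0.462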
gargantext_web/views.py
@@ -25,6 +25,7 @@ from django import forms
 from collections import defaultdict
 from parsing.FileParsers import *
+import os

 # SOME FUNCTIONS
...
@@ -282,9 +283,8 @@ def project(request, project_id):
     cooclists = ""  #.children.filter(type=type_cooclist)
     for corpus in corpora:
-        # print("corpus", corpus.pk , corpus.name , corpus.type_id)
         docs_count = Node.objects.filter(parent=corpus, type=type_document).count()
+        # print("corpus:", corpus.pk , " | name:",corpus.name , " | type:",corpus.type_id , " | #docs:",docs_count)
         docs_total += docs_count
         corpus_view = dict()
...
@@ -727,6 +727,7 @@ def graph(request, project_id, corpus_id):
         'date': date, \
         'corpus': corpus, \
         'project': project, \
+        'graphfile': "hola_mundo", \
     }))
     return HttpResponse(html)
...
@@ -839,18 +840,23 @@ def send_csv(request, corpus_id):
 from gargantext_web.api import JsonHttpResponse
 from analysis.functions import get_cooc
 import json
+from gargantext_web.settings import MEDIA_ROOT

 def node_link(request, corpus_id):
     '''
     Create the HttpResponse object with the node_link dataset.
     '''
-    import time
-    print("In node_link() START")
-    data = []
-    start = time.time()
-    data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
-    end = time.time()
-    print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" get_cooc() [s]",(end - start))
-    print("In node_link() END")
+    corpus = Node.objects.get(id=corpus_id)
+    filename = MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (request.user, corpus.parent.id, corpus_id)
+    print("file exists?:", os.path.isfile(filename))
+    if os.path.isfile(filename):
+        json_data = open(filename, "r")
+        data = json.load(json_data)
+        json_data.close()
+    else:
+        data = get_cooc(request=request, corpus_id=corpus_id, type="node_link")
     return JsonHttpResponse(data)

 def adjacency(request, corpus_id):
...
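The rewritten node_link() serves the pre-generated graph file when it exists and only falls back to recomputing the co-occurrence data. A minimal sketch of that read-if-cached-else-compute pattern, with generic names rather than the project's API:

    import json
    import os

    def load_or_compute(filename, compute):
        """Return cached JSON from filename if present, else call compute()."""
        if os.path.isfile(filename):
            with open(filename, "r") as f:
                return json.load(f)
        return compute()

    # Hypothetical usage, mirroring node_link():
    # data = load_or_compute(path, lambda: get_cooc(corpus_id=corpus_id, type="node_link"))

As in the diff, nothing on the read side writes the cache; the JSON file is produced by workflow__MOV() in node/models.py below.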
node/models.py
@@ -287,16 +287,6 @@ class Node(CTENode):
         for p in proc:
             p.join()

-    def pushScore(self, FINAL, n1, n2, score):
-        if not FINAL.has_key(n1):
-            FINAL[n1] = []
-        FINAL[n1].append(score)
-        if not FINAL.has_key(n2):
-            FINAL[n2] = []
-        FINAL[n2].append(score)

     def parse_resources__MOV(self, verbose=False):
         # parse all resources into a list of metadata
         metadata_list = []
...
@@ -436,10 +426,12 @@ class Node(CTENode):
             docID = i[0]
             associations = i[1]
+            # [ considering just {2,3}-grams ]
             termsCount = 0
             for ngram_text, weight in associations.items():
                 if ngram_text in NGram2ID:  # considering just {2,3}-grams
                     termsCount += 1
+            # [ / considering just {2,3}-grams ]
             ngrams_by_document = termsCount  # i re-calculed this because of *02*
             terms = []
...
@@ -562,12 +554,12 @@ class Node(CTENode):
         total += (end - start)
         print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]",(end - start))
-        # print("LOG::TIME: In workflow() writeMetadata__MOV()")
-        # start = time.time()
-        # self.writeMetadata__MOV( metadata_list=theMetadata )
-        # end = time.time()
-        # total += (end - start)
-        # print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
+        print("LOG::TIME: In workflow() writeMetadata__MOV()")
+        start = time.time()
+        self.writeMetadata__MOV( metadata_list=theMetadata )
+        end = time.time()
+        total += (end - start)
+        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
         print("LOG::TIME: In workflow() extract_ngrams__MOV()")
...
@@ -585,33 +577,28 @@ class Node(CTENode):
         print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
         # # print("LOG::TIME: In workflow() / do_tfidf()")
+        start = time.time()
+        print("LOG::TIME: In workflow() do_coocmatrix()")
-        # print("\n= = = = = = = = = = = = = = = =")
-        # print("NUMBER OF NGRAMS:",len(resultDict["G"]))
-        # # M = resultDict["metrics"]
-        # # Metrics2 = sorted(M, key=lambda x: M[x]['C'])
-        # # for i in Metrics2:
-        # #     print("as: ",i,":",M[i])
-        # print("= = = = = = = = = = = = = = = =\n")
         jsongraph = self.do_coocmatrix__MOV(resultDict["TERMS"], resultDict["G"], n=150)
+        end = time.time()
+        import pprint
+        total += (end - start)
+        pprint.pprint(jsongraph)
+        print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_coocmatrix() [s]",(end - start))
+        print("the user:", self.user)
+        print("the project id:", self.parent.id)
+        print("the corpus id:", self.id)
+        # timestamp = str(datetime.datetime.now().isoformat())
+        # # filename = MEDIA_ROOT + '/corpora/%s/%s_%s__%s.json' % (self.user , self.parent.id, self.id , timestamp)
+        filename = MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (self.user, self.parent.id, self.id)
+        import json
+        f = open(filename, "w")
+        f.write(json.dumps(jsongraph))
+        f.close()
         # # # this is not working
         # # self.runInParallel( self.writeMetadata__MOV( metadata_list=theMetadata ) , self.extract_ngrams__MOV(theMetadata , keys=['title','abstract',] ) )
-        # start = time.time()
-        # print("LOG::TIME: In workflow() do_tfidf()")
-        # from analysis.functions import do_tfidf
-        # do_tfidf(self)
-        # end = time.time()
-        # total += (end - start)
-        # print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_tfidf() [s]",(end - start))
-        # # # print("LOG::TIME: In workflow() / do_tfidf()")
         print("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" In workflow() END")
...
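workflow__MOV() now writes the generated graph to MEDIA_ROOT + '/corpora/%s/%s_%s.json' % (user, project_id, corpus_id), the same path node_link() checks before recomputing. A sketch of that write, assuming the per-user directory may not exist yet; the with-statement is an equivalent, slightly safer form of the open()/write()/close() sequence in the diff:

    import json
    import os

    def write_graph(media_root, user, project_id, corpus_id, jsongraph):
        """Persist the co-occurrence graph where node_link() will look for it."""
        filename = media_root + '/corpora/%s/%s_%s.json' % (user, project_id, corpus_id)
        os.makedirs(os.path.dirname(filename), exist_ok=True)  # assumption: directory may be missing
        with open(filename, "w") as f:  # closed even if serialization raises
            f.write(json.dumps(jsongraph))

A side note on the deleted pushScore() helper: it relied on dict.has_key(), which no longer exists in Python 3; the equivalent test is "n1 in FINAL".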
scrap_pubmed/views.py
@@ -132,10 +132,14 @@ def doTheQuery(request , project_id):
     # do the WorkFlow
     try:
         if DEBUG is True:
-            corpus.workflow()
+            # corpus.workflow() # old times...
+            # corpus.workflow__MOV()
+            corpus.workflow__MOV()
+            # corpus.write_everything_to_DB()
         else:
-            corpus.workflow.apply_async((), countdown=3)
+            # corpus.workflow.apply_async((), countdown=3)
+            corpus.workflow__MOV()  # synchronous! because is faaast
+            # corpus.write_everything_to_DB.apply_async((), countdown=3)  # asynchronous
         return JsonHttpResponse(["workflow", "finished"])
     except Exception as error:
...
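Both branches of the DEBUG toggle now run the workflow synchronously; the Celery dispatch survives only in comments. For reference, the commented-out corpus.workflow.apply_async((), countdown=3) uses Celery's standard Task.apply_async signature. A sketch of the same sync-or-queue toggle with the task registration made explicit (the decorator and task body are illustrative, the 3-second countdown comes from the diff):

    from celery import shared_task
    from node.models import Node

    @shared_task
    def run_corpus_workflow(corpus_id):
        """Queued variant: look the corpus up and run its workflow."""
        Node.objects.get(id=corpus_id).workflow__MOV()

    def dispatch(corpus, debug=True):
        if debug:
            corpus.workflow__MOV()  # synchronous, easier to trace while developing
        else:
            run_corpus_workflow.apply_async((corpus.id,), countdown=3)  # start after 3 s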
templates/project.html
@@ -84,13 +84,10 @@
         <ul>
         {% for corpus in corpora %}
             <li>
             {% ifnotequal corpus.count 0 %}
                 <a href="/project/{{project.id}}/corpus/{{corpus.id}}">{{corpus.name}}</a>, {{ corpus.count }} Documents
             {% else %}
                 {{corpus.name}} : <img width="20px" src="{% static "js/libs/img2/loading-bar.gif" %}"></img> Processing, drink a cup of tea, and refresh the page :)
             {% endifnotequal %}
             <button type="button" class="btn btn-xs btn-default" data-container="body" data-toggle="popover" data-placement="bottom"
                 data-content='
                 <ul>
...