Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
gargantext
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
humanities
gargantext
Commits
03ac1095
Commit
03ac1095
authored
Mar 04, 2015
by
PkSM3
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
[UPDATE] pushing for the big merge
parent
2d1a9b89
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
50 additions
and
70 deletions
+50
-70
functions.py
analysis/functions.py
+1
-6
views.py
gargantext_web/views.py
+1
-21
admin.py
node/admin.py
+0
-2
models.py
node/models.py
+39
-29
PubmedFileParser.py
parsing/FileParsers/PubmedFileParser.py
+1
-1
views.py
scrap_pubmed/views.py
+5
-9
explorer.html
templates/explorer.html
+3
-2
No files found.
analysis/functions.py
View file @
03ac1095
...
...
@@ -160,14 +160,12 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
if
Node
.
objects
.
filter
(
type
=
type_cooc
,
parent
=
corpus
)
.
first
()
is
None
:
print
(
"Coocurrences do not exist yet, create it."
)
whitelist
=
create_whitelist
(
request
.
user
,
corpus
,
size
=
n
)
print
(
"PRINTING WHITELIST:"
,
whitelist
)
cooccurrence_node
=
create_cooc
(
user
=
request
.
user
,
corpus
=
corpus
,
whitelist
=
whitelist
,
size
=
n
)
print
(
cooccurrence_node
.
id
,
"Cooc created"
)
else
:
cooccurrence_node
=
Node
.
objects
.
filter
(
type
=
type_cooc
,
parent
=
corpus
)
.
first
()
for
cooccurrence
in
NodeNgramNgram
.
objects
.
filter
(
node
=
cooccurrence_node
):
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"
:
",cooccurrence.score)
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"
\t
",cooccurrence.score)
ids
[
cooccurrence
.
ngramx
.
terms
]
=
cooccurrence
.
ngramx
.
id
ids
[
cooccurrence
.
ngramy
.
terms
]
=
cooccurrence
.
ngramy
.
id
...
...
@@ -180,8 +178,6 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
weight
[
cooccurrence
.
ngramy
.
terms
]
=
weight
.
get
(
cooccurrence
.
ngramy
.
terms
,
0
)
+
cooccurrence
.
score
weight
[
cooccurrence
.
ngramx
.
terms
]
=
weight
.
get
(
cooccurrence
.
ngramx
.
terms
,
0
)
+
cooccurrence
.
score
print
(
"
\n
===================
\n
NUMBER OF NGRAMS_2:"
,
len
(
weight
.
keys
()))
df
=
pd
.
DataFrame
(
matrix
)
.
fillna
(
0
)
x
=
copy
(
df
.
values
)
x
=
x
/
x
.
sum
(
axis
=
1
)
...
...
@@ -194,7 +190,6 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
G
=
nx
.
from_numpy_matrix
(
matrix_filtered
)
G
=
nx
.
relabel_nodes
(
G
,
dict
(
enumerate
([
labels
[
label
]
for
label
in
list
(
df
.
columns
)])))
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
print
(
"NUMBER OF NODES_2"
,
len
(
G
))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
...
...
gargantext_web/views.py
View file @
03ac1095
...
...
@@ -312,15 +312,12 @@ def project(request, project_id):
if
docs_total
==
0
or
docs_total
is
None
:
docs_total
=
1
# The donut will show: percentage by
donut
=
[
{
'source'
:
key
,
'count'
:
donut_part
[
key
]
,
'part'
:
round
(
donut_part
[
key
]
*
100
/
docs_total
)
}
\
for
key
in
donut_part
.
keys
()
]
dauser
=
User
.
objects
.
get
(
username
=
user
)
groups
=
len
(
dauser
.
groups
.
filter
(
name
=
"PubMed_0.1"
))
print
(
"*groupslen*:"
,
groups
)
...
...
@@ -330,8 +327,6 @@ def project(request, project_id):
form
=
CustomForm
(
request
.
POST
,
request
.
FILES
)
if
form
.
is_valid
():
name
=
form
.
cleaned_data
[
'name'
]
thefile
=
form
.
cleaned_data
[
'file'
]
resource_type
=
ResourceType
.
objects
.
get
(
name
=
str
(
form
.
cleaned_data
[
'type'
]
))
...
...
@@ -364,9 +359,7 @@ def project(request, project_id):
type
=
node_type
,
name
=
name
,
)
corpus
.
save
()
corpus
.
add_resource
(
user
=
request
.
user
,
type
=
resource_type
,
...
...
@@ -386,13 +379,11 @@ def project(request, project_id):
return
HttpResponseRedirect
(
'/project/'
+
str
(
project_id
))
except
Exception
as
error
:
print
(
'ee'
,
error
)
form
=
CorpusForm
(
request
=
request
)
formResource
=
ResourceForm
()
else
:
print
(
"bad form, bad form"
)
return
render
(
request
,
'project.html'
,
{
...
...
@@ -409,8 +400,7 @@ def project(request, project_id):
})
else
:
form
=
CustomForm
()
return
render
(
request
,
'project.html'
,
{
'form'
:
form
,
'user'
:
user
,
...
...
@@ -666,8 +656,6 @@ def subcorpusJSON(request, project_id, corpus_id, start , end ):
# return HttpResponse(html)
return
HttpResponse
(
serializer
.
data
,
content_type
=
'application/json'
)
def
delete_project
(
request
,
node_id
):
Node
.
objects
.
filter
(
id
=
node_id
)
.
all
()
.
delete
()
return
HttpResponseRedirect
(
'/projects/'
)
...
...
@@ -676,7 +664,6 @@ def delete_corpus(request, project_id, corpus_id):
Node
.
objects
.
filter
(
id
=
corpus_id
)
.
all
()
.
delete
()
return
HttpResponseRedirect
(
'/project/'
+
project_id
)
def
chart
(
request
,
project_id
,
corpus_id
):
''' Charts to compare, filter, count'''
if
MAINTENANCE
:
return
HttpResponseRedirect
(
'/maintenance/'
)
...
...
@@ -732,10 +719,6 @@ def graph(request, project_id, corpus_id):
return
HttpResponse
(
html
)
def
exploration
(
request
):
if
MAINTENANCE
:
return
HttpResponseRedirect
(
'/maintenance/'
)
t
=
get_template
(
'exploration.html'
)
...
...
@@ -793,8 +776,6 @@ def corpus_csv(request, project_id, corpus_id):
return
response
def
send_csv
(
request
,
corpus_id
):
'''
Create the HttpResponse object with the appropriate CSV header.
...
...
@@ -835,7 +816,6 @@ def send_csv(request, corpus_id):
return
response
# To get the data
from
gargantext_web.api
import
JsonHttpResponse
from
analysis.functions
import
get_cooc
...
...
node/admin.py
View file @
03ac1095
...
...
@@ -132,8 +132,6 @@ class CustomForm(forms.Form):
# raise forms.ValidationError(_('We need a zip pls.'))
return
file_
class
CorpusForm
(
ModelForm
):
#parent = ModelChoiceField(EmptyQuerySet)
def
__init__
(
self
,
*
args
,
**
kwargs
):
...
...
node/models.py
View file @
03ac1095
...
...
@@ -222,12 +222,14 @@ class Node(CTENode):
associations
=
defaultdict
(
float
)
# float or int?
if
isinstance
(
keys
,
dict
):
for
key
,
weight
in
keys
.
items
():
for
ngram
in
extractor
.
extract_ngrams
(
self
.
metadata
[
key
]):
text2process
=
str
(
self
.
metadata
[
key
])
.
replace
(
'['
,
''
)
.
replace
(
']'
,
''
)
for
ngram
in
extractor
.
extract_ngrams
(
text2process
):
terms
=
' '
.
join
([
token
for
token
,
tag
in
ngram
])
associations
[
ngram
]
+=
weight
else
:
for
key
in
keys
:
for
ngram
in
extractor
.
extract_ngrams
(
self
.
metadata
[
key
]):
text2process
=
str
(
self
.
metadata
[
key
])
.
replace
(
'['
,
''
)
.
replace
(
']'
,
''
)
for
ngram
in
extractor
.
extract_ngrams
(
text2process
):
terms
=
' '
.
join
([
token
for
token
,
tag
in
ngram
])
associations
[
terms
]
+=
1
Node_Ngram
.
objects
.
bulk_create
([
...
...
@@ -318,18 +320,21 @@ class Node(CTENode):
language
=
langages_cache
[
metadata_values
[
'language_iso2'
]]
if
'language_iso2'
in
metadata_values
else
None
,
if
isinstance
(
language
,
tuple
):
language
=
language
[
0
]
Node
(
node
=
Node
(
user_id
=
user_id
,
type_id
=
type_id
,
name
=
name
,
parent
=
self
,
language_id
=
language
.
id
if
language
else
None
,
metadata
=
metadata_values
)
.
save
()
)
node
.
save
()
metadata_values
[
"id"
]
=
node
.
id
# # make metadata filterable
self
.
children
.
all
()
.
make_metadata_filterable
()
# # mark the resources as parsed for this node
self
.
node_resource
.
update
(
parsed
=
True
)
return
metadata_list
def
extract_ngrams__MOV
(
self
,
array
,
keys
,
ngramsextractorscache
=
None
,
ngramscaches
=
None
):
if
ngramsextractorscache
is
None
:
...
...
@@ -369,7 +374,7 @@ class Node(CTENode):
associations
[
terms
]
+=
1
if
(
len
(
associations
)
>
0
):
results
.
append
(
[
i
,
associations
]
)
results
.
append
(
[
metadata
[
"id"
]
,
associations
]
)
i
+=
1
return
results
...
...
@@ -421,7 +426,7 @@ class Node(CTENode):
ngramid
+=
1
# *03* [ / making dictionaries for NGram_Text <=> NGram_ID ]
docs_X_terms
=
{}
for
i
in
FreqList
:
# foreach ID in Doc:
docID
=
i
[
0
]
associations
=
i
[
1
]
...
...
@@ -435,9 +440,10 @@ class Node(CTENode):
ngrams_by_document
=
termsCount
# i re-calculed this because of *02*
terms
=
[]
terms_occ
=
[]
if
ngrams_by_document
>
0
:
for
ngram_text
,
weight
in
associations
.
items
():
if
ngram_text
in
NGram2ID
:
if
ngram_text
in
NGram2ID
:
terms
.
append
(
NGram2ID
[
ngram_text
])
# [ calculating TF-IDF ]
occurrences_of_ngram
=
weight
...
...
@@ -446,6 +452,9 @@ class Node(CTENode):
yy
=
FirstNgrams
[
ngram_text
][
"C"
]
inverse_document_frequency
=
log
(
xx
/
yy
)
#log base e
tfidfScore
=
term_frequency
*
inverse_document_frequency
terms_occ
.
append
(
[
NGram2ID
[
ngram_text
]
,
round
(
tfidfScore
,
3
)
]
)
# [ / calculating TF-IDF ]
if
"T"
in
FirstNgrams
[
ngram_text
]:
FirstNgrams
[
ngram_text
][
"T"
]
.
append
(
tfidfScore
)
...
...
@@ -453,9 +462,13 @@ class Node(CTENode):
FirstNgrams
[
ngram_text
][
"T"
]
=
[
tfidfScore
]
if
len
(
terms
)
>
1
:
docs_X_terms
[
docID
]
=
terms_occ
# print("docid:",docID)
# for i in terms:
# print("\t",ID2NGram[i])
calc
.
addCompleteSubGraph
(
terms
)
return
{
"G"
:
calc
.
G
,
"TERMS"
:
ID2NGram
,
"metrics"
:
FirstNgrams
}
return
{
"G"
:
calc
.
G
,
"TERMS"
:
ID2NGram
,
"
ii"
:
docs_X_terms
,
"
metrics"
:
FirstNgrams
}
def
do_coocmatrix__MOV
(
self
,
TERMS
,
G
,
n
=
150
,
type
=
'node_link'
):
import
pandas
as
pd
...
...
@@ -475,20 +488,19 @@ class Node(CTENode):
n1
=
e
[
0
]
n2
=
e
[
1
]
w
=
G
[
n1
][
n2
][
'weight'
]
# print("\t",n1," <=> ",n2, " : ", G[n1][n2]['weight'],"\t",TERMS[n1]," <=> ",TERMS[n2], " : ", G[n1][n2]['weight'])
# print(n1," <=> ",n2, " : ", G[n1][n2]['weight'],"\t",TERMS[n1]," <=> ",TERMS[n2], "\t", G[n1][n2]['weight'])
ids
[
TERMS
[
n1
]]
=
n1
ids
[
TERMS
[
n2
]]
=
n2
labels
[
n1
]
=
TERMS
[
n1
]
labels
[
n2
]
=
TERMS
[
n2
]
matrix
[
n1
][
n2
]
=
w
matrix
[
n2
][
n1
]
=
w
matrix
[
n1
][
n2
]
=
w
matrix
[
n2
][
n1
]
=
w
weight
[
n2
]
=
weight
.
get
(
n2
,
0
)
+
w
weight
[
n1
]
=
weight
.
get
(
n1
,
0
)
+
w
weight
[
TERMS
[
n2
]]
=
weight
.
get
(
TERMS
[
n2
],
0
)
+
w
weight
[
TERMS
[
n1
]]
=
weight
.
get
(
TERMS
[
n1
],
0
)
+
w
print
(
"
\n
===================
\n
NUMBER OF NGRAMS:"
,
len
(
weight
.
keys
()))
df
=
pd
.
DataFrame
(
matrix
)
.
fillna
(
0
)
x
=
copy
(
df
.
values
)
x
=
x
/
x
.
sum
(
axis
=
1
)
...
...
@@ -499,27 +511,23 @@ class Node(CTENode):
#matrix_filtered = np.where(x > threshold, x, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G
=
nx
.
from_numpy_matrix
(
matrix_filtered
)
G
=
nx
.
relabel_nodes
(
G
,
dict
(
enumerate
([
labels
[
label
]
for
label
in
list
(
df
.
columns
)])))
print
(
"NUMBER OF NODES:"
,
len
(
G
))
#
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
partition
=
best_partition
(
G
)
data
=
[]
if
type
==
"node_link"
:
for
community
in
set
(
partition
.
values
()):
#print(community)
G
.
add_node
(
"cluster "
+
str
(
community
),
hidden
=
1
)
for
node
in
G
.
nodes
():
try
:
#node,type(labels[node])
G
.
node
[
node
][
'label'
]
=
node
G
.
node
[
node
][
'name'
]
=
node
G
.
node
[
node
][
'pk'
]
=
ids
[
str
(
node
)]
G
.
node
[
node
][
'label'
]
=
TERMS
[
node
]
G
.
node
[
node
][
'pk'
]
=
node
G
.
node
[
node
][
'size'
]
=
weight
[
node
]
G
.
node
[
node
][
'group'
]
=
partition
[
node
]
G
.
add_edge
(
node
,
"cluster "
+
str
(
partition
[
node
]),
weight
=
3
)
except
Exception
as
error
:
print
(
error
)
print
(
"IMA IN node_link CASE"
)
print
(
"ERROR:"
,
error
)
data
=
json_graph
.
node_link_data
(
G
)
elif
type
==
"adjacency"
:
...
...
@@ -533,10 +541,8 @@ class Node(CTENode):
#G.add_edge(node, partition[node], weight=3)
except
Exception
as
error
:
print
(
error
)
print
(
"IMA IN adjacency CASE"
)
data
=
json_graph
.
node_link_data
(
G
)
print
(
"* * * * FINISHED * * * *"
)
return
data
...
...
@@ -554,14 +560,14 @@ class Node(CTENode):
total
+=
(
end
-
start
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" parse_resources()__MOV [s]"
,(
end
-
start
))
print
(
"LOG::TIME: In workflow() writeMetadata__MOV()"
)
start
=
time
.
time
()
self
.
writeMetadata__MOV
(
metadata_list
=
theMetadata
)
theMetadata
=
self
.
writeMetadata__MOV
(
metadata_list
=
theMetadata
)
end
=
time
.
time
()
total
+=
(
end
-
start
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" writeMetadata__MOV() [s]"
,(
end
-
start
))
print
(
"LOG::TIME: In workflow() extract_ngrams__MOV()"
)
start
=
time
.
time
()
FreqList
=
self
.
extract_ngrams__MOV
(
theMetadata
,
keys
=
[
'title'
]
)
...
...
@@ -580,10 +586,14 @@ class Node(CTENode):
start
=
time
.
time
()
print
(
"LOG::TIME: In workflow() do_coocmatrix()"
)
jsongraph
=
self
.
do_coocmatrix__MOV
(
resultDict
[
"TERMS"
]
,
resultDict
[
"G"
]
,
n
=
150
)
jsongraph
[
"stats"
]
=
resultDict
[
"ii"
]
end
=
time
.
time
()
total
+=
(
end
-
start
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" do_coocmatrix() [s]"
,(
end
-
start
))
# import pprint
# pprint.pprint(jsongraph)
print
(
"the user:"
,
self
.
user
)
print
(
"the project id:"
,
self
.
parent
.
id
)
print
(
"the corpus id:"
,
self
.
id
)
...
...
parsing/FileParsers/PubmedFileParser.py
View file @
03ac1095
...
...
@@ -25,7 +25,7 @@ class PubmedFileParser(FileParser):
metadata_path
=
{
"journal"
:
'MedlineCitation/Article/Journal/Title'
,
"title"
:
'MedlineCitation/Article/ArticleTitle'
,
"abstract"
:
'MedlineCitation/Article/Abstract/AbstractText'
,
#
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"title2"
:
'MedlineCitation/Article/VernacularTitle'
,
"language_iso3"
:
'MedlineCitation/Article/Language'
,
"doi"
:
'PubmedData/ArticleIdList/ArticleId[@type=doi]'
,
...
...
scrap_pubmed/views.py
View file @
03ac1095
...
...
@@ -31,7 +31,7 @@ def getGlobalStats(request ):
alist
=
[
"bar"
,
"foo"
]
if
request
.
method
==
"POST"
:
N
=
10
0
N
=
10
query
=
request
.
POST
[
"query"
]
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" query ="
,
query
)
print
(
"LOG::TIME:_ "
+
datetime
.
datetime
.
now
()
.
isoformat
()
+
" N ="
,
N
)
...
...
@@ -73,8 +73,6 @@ def doTheQuery(request , project_id):
alist
=
[
"hola"
,
"mundo"
]
if
request
.
method
==
"POST"
:
query
=
request
.
POST
[
"query"
]
name
=
request
.
POST
[
"string"
]
...
...
@@ -101,14 +99,12 @@ def doTheQuery(request , project_id):
type_id
=
NodeType
.
objects
.
get
(
name
=
'Document'
)
.
id
user_id
=
User
.
objects
.
get
(
username
=
request
.
user
)
.
id
corpus
=
Node
(
user
=
request
.
user
,
parent
=
parent
,
type
=
node_type
,
name
=
name
,
)
corpus
.
save
()
tasks
=
MedlineFetcher
()
...
...
@@ -132,12 +128,12 @@ def doTheQuery(request , project_id):
# do the WorkFlow
try
:
if
DEBUG
is
True
:
#
corpus.workflow() # old times...
corpus
.
workflow__MOV
()
corpus
.
workflow
()
# old times...
#
corpus.workflow__MOV()
# corpus.write_everything_to_DB()
else
:
#
corpus.workflow.apply_async((), countdown=3)
corpus
.
workflow__MOV
()
# synchronous! because is faaast
corpus
.
workflow
.
apply_async
((),
countdown
=
3
)
#
corpus.workflow__MOV() # synchronous! because is faaast
# corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
...
...
templates/explorer.html
View file @
03ac1095
...
...
@@ -260,8 +260,8 @@
</div>
<div
id=
"topPapers"
></div>
<!--
<div id="tab-container-top" class='tab-container'>
<ul class='etabs'>
...
...
@@ -278,6 +278,7 @@
</div>
</div>
</div>
-->
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment