Commit 03ac1095 authored by PkSM3

[UPDATE] pushing for the big merge

parent 2d1a9b89
......@@ -160,14 +160,12 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
if Node.objects.filter(type=type_cooc, parent=corpus).first() is None:
print("Coocurrences do not exist yet, create it.")
whitelist = create_whitelist(request.user, corpus, size=n)
print("PRINTING WHITELIST:", whitelist)
cooccurrence_node = create_cooc(user=request.user, corpus=corpus, whitelist=whitelist, size=n)
print(cooccurrence_node.id, "Cooc created")
else:
cooccurrence_node = Node.objects.filter(type=type_cooc, parent=corpus).first()
for cooccurrence in NodeNgramNgram.objects.filter(node=cooccurrence_node):
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms," : ",cooccurrence.score)
# print(cooccurrence.ngramx.terms," <=> ",cooccurrence.ngramy.terms,"\t",cooccurrence.score)
ids[cooccurrence.ngramx.terms] = cooccurrence.ngramx.id
ids[cooccurrence.ngramy.terms] = cooccurrence.ngramy.id
......@@ -180,8 +178,6 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
weight[cooccurrence.ngramy.terms] = weight.get(cooccurrence.ngramy.terms, 0) + cooccurrence.score
weight[cooccurrence.ngramx.terms] = weight.get(cooccurrence.ngramx.terms, 0) + cooccurrence.score
print("\n===================\nNUMBER OF NGRAMS_2:",len(weight.keys()))
df = pd.DataFrame(matrix).fillna(0)
x = copy(df.values)
x = x / x.sum(axis=1)
......@@ -194,7 +190,6 @@ def get_cooc(request=None, corpus_id=None, cooc_id=None, type='node_link', n=150
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
#G = nx.relabel_nodes(G, dict(enumerate(df.columns)))
print("NUMBER OF NODES_2",len(G))
# Removing too connected nodes (find automatic way to do it)
# outdeg = G.degree()
# to_remove = [n for n in outdeg if outdeg[n] >= 10]
......
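The hunk above ends where get_cooc turns the accumulated weights into a graph: a pandas DataFrame of co-occurrence scores, a row normalization, a threshold, then a networkx graph relabeled with the ngram terms. A minimal self-contained sketch of that pipeline, with toy data and an assumed cutoff (the real threshold is elided in the hunk); it uses the networkx 1.x/2.x from_numpy_matrix API the code itself relies on:

    from collections import defaultdict
    import numpy as np
    import pandas as pd
    import networkx as nx

    # Toy co-occurrence scores and labels; both are invented for illustration.
    matrix = defaultdict(lambda: defaultdict(float))
    matrix[0][1] = matrix[1][0] = 3.0
    labels = {0: 'brain', 1: 'imaging'}

    df = pd.DataFrame(matrix).fillna(0)
    x = df.values.copy()
    x = x / x.sum(axis=1)                      # same normalization as above
    threshold = 0.1                            # assumed; get_cooc's value is not shown
    matrix_filtered = np.where(x > threshold, x, 0)
    G = nx.from_numpy_matrix(matrix_filtered)
    G = nx.relabel_nodes(G, dict(enumerate([labels[l] for l in list(df.columns)])))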
......@@ -312,15 +312,12 @@ def project(request, project_id):
if docs_total == 0 or docs_total is None:
docs_total = 1
# The donut will show: percentage by source
donut = [ {'source': key,
'count': donut_part[key] ,
'part' : round(donut_part[key] * 100 / docs_total) } \
for key in donut_part.keys() ]
dauser = User.objects.get( username=user )
groups = len(dauser.groups.filter(name="PubMed_0.1"))
print("*groupslen*:",groups)
......@@ -330,8 +327,6 @@ def project(request, project_id):
form = CustomForm(request.POST, request.FILES)
if form.is_valid():
name = form.cleaned_data['name']
thefile = form.cleaned_data['file']
resource_type = ResourceType.objects.get(name=str( form.cleaned_data['type'] ))
......@@ -364,9 +359,7 @@ def project(request, project_id):
type=node_type,
name=name,
)
corpus.save()
corpus.add_resource(
user=request.user,
type=resource_type,
......@@ -386,13 +379,11 @@ def project(request, project_id):
return HttpResponseRedirect('/project/' + str(project_id))
except Exception as error:
print('ee', error)
form = CorpusForm(request=request)
formResource = ResourceForm()
else:
print("bad form, bad form")
return render(request, 'project.html', {
......@@ -409,8 +400,7 @@ def project(request, project_id):
})
else:
form = CustomForm()
return render(request, 'project.html', {
'form' : form,
'user' : user,
......@@ -666,8 +656,6 @@ def subcorpusJSON(request, project_id, corpus_id, start , end ):
# return HttpResponse(html)
return HttpResponse( serializer.data , content_type='application/json')
def delete_project(request, node_id):
Node.objects.filter(id=node_id).all().delete()
return HttpResponseRedirect('/projects/')
......@@ -676,7 +664,6 @@ def delete_corpus(request, project_id, corpus_id):
Node.objects.filter(id=corpus_id).all().delete()
return HttpResponseRedirect('/project/' + project_id)
def chart(request, project_id, corpus_id):
''' Charts to compare, filter, count'''
if MAINTENANCE: return HttpResponseRedirect('/maintenance/')
......@@ -732,10 +719,6 @@ def graph(request, project_id, corpus_id):
return HttpResponse(html)
def exploration(request):
if MAINTENANCE: return HttpResponseRedirect('/maintenance/')
t = get_template('exploration.html')
......@@ -793,8 +776,6 @@ def corpus_csv(request, project_id, corpus_id):
return response
def send_csv(request, corpus_id):
'''
Create the HttpResponse object with the appropriate CSV header.
......@@ -835,7 +816,6 @@ def send_csv(request, corpus_id):
return response
# To get the data
from gargantext_web.api import JsonHttpResponse
from analysis.functions import get_cooc
......
......@@ -132,8 +132,6 @@ class CustomForm(forms.Form):
# raise forms.ValidationError(_('We need a zip pls.'))
return file_
class CorpusForm(ModelForm):
#parent = ModelChoiceField(EmptyQuerySet)
def __init__(self, *args, **kwargs):
......
......@@ -222,12 +222,14 @@ class Node(CTENode):
associations = defaultdict(float) # float or int?
if isinstance(keys, dict):
for key, weight in keys.items():
for ngram in extractor.extract_ngrams(self.metadata[key]):
text2process = str(self.metadata[key]).replace('[','').replace(']','')
for ngram in extractor.extract_ngrams(text2process):
terms = ' '.join([token for token, tag in ngram])
associations[terms] += weight
else:
for key in keys:
for ngram in extractor.extract_ngrams(self.metadata[key]):
text2process = str(self.metadata[key]).replace('[','').replace(']','')
for ngram in extractor.extract_ngrams(text2process):
terms = ' '.join([token for token, tag in ngram])
associations[terms] += 1
Node_Ngram.objects.bulk_create([
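Both branches above run the metadata value through the same text2process cleanup before n-gram extraction: list-valued fields arrive as the string repr of a Python list, so the brackets are stripped first. A toy illustration (the sample value is invented):

    # A list-valued metadata field, as it would reach extract_ngrams:
    value = ['Brain imaging', 'fMRI']
    text2process = str(value).replace('[', '').replace(']', '')
    # -> "'Brain imaging', 'fMRI'"  -- bracket-free text the extractor can tokenize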
......@@ -318,18 +320,21 @@ class Node(CTENode):
language = langages_cache[metadata_values['language_iso2']] if 'language_iso2' in metadata_values else None,
if isinstance(language, tuple):
language = language[0]
Node(
node = Node(
user_id = user_id,
type_id = type_id,
name = name,
parent = self,
language_id = language.id if language else None,
metadata = metadata_values
).save()
)
node.save()
metadata_values["id"] = node.id
# # make metadata filterable
self.children.all().make_metadata_filterable()
# # mark the resources as parsed for this node
self.node_resource.update(parsed=True)
return metadata_list
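The reworked block saves the node before recording it, so the database id can be written back into metadata_values; extract_ngrams__MOV below then keys its per-document results by that id instead of by loop position. A minimal sketch of the pattern, with a stub standing in for the Django model:

    import itertools

    class FakeNode:                      # stand-in for the Node model, illustration only
        _ids = itertools.count(1)
        def __init__(self, **fields):
            self.__dict__.update(fields)
        def save(self):
            self.id = next(self._ids)    # Django assigns the pk on save()

    metadata_list = [{'title': 'Doc A'}, {'title': 'Doc B'}]
    for metadata_values in metadata_list:
        node = FakeNode(metadata=metadata_values)
        node.save()
        metadata_values["id"] = node.id  # later: results.append([metadata["id"], associations])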
def extract_ngrams__MOV(self, array , keys , ngramsextractorscache=None, ngramscaches=None):
if ngramsextractorscache is None:
......@@ -369,7 +374,7 @@ class Node(CTENode):
associations[terms] += 1
if(len(associations)>0):
results.append( [i , associations] )
results.append( [metadata["id"] , associations] )
i+=1
return results
......@@ -421,7 +426,7 @@ class Node(CTENode):
ngramid+=1
# *03* [ / making dictionaries for NGram_Text <=> NGram_ID ]
docs_X_terms = {}
for i in FreqList: # foreach ID in Doc:
docID = i[0]
associations = i[1]
......@@ -435,9 +440,10 @@ class Node(CTENode):
ngrams_by_document = termsCount # recalculated because of *02*
terms = []
terms_occ = []
if ngrams_by_document > 0:
for ngram_text, weight in associations.items():
if ngram_text in NGram2ID:
if ngram_text in NGram2ID:
terms.append(NGram2ID[ngram_text])
# [ calculating TF-IDF ]
occurrences_of_ngram = weight
......@@ -446,6 +452,9 @@ class Node(CTENode):
yy = FirstNgrams[ngram_text]["C"]
inverse_document_frequency= log(xx/yy) #log base e
tfidfScore = term_frequency*inverse_document_frequency
terms_occ.append( [ NGram2ID[ngram_text] , round(tfidfScore,3) ] )
# [ / calculating TF-IDF ]
if "T" in FirstNgrams[ngram_text]:
FirstNgrams[ngram_text]["T"].append(tfidfScore)
......@@ -453,9 +462,13 @@ class Node(CTENode):
FirstNgrams[ngram_text]["T"] = [tfidfScore]
if len(terms)>1:
docs_X_terms[docID] = terms_occ
# print("docid:",docID)
# for i in terms:
# print("\t",ID2NGram[i])
calc.addCompleteSubGraph(terms)
return { "G":calc.G , "TERMS": ID2NGram , "metrics":FirstNgrams }
return { "G":calc.G , "TERMS": ID2NGram , "ii":docs_X_terms ,"metrics":FirstNgrams }
def do_coocmatrix__MOV(self , TERMS , G , n=150 , type='node_link'):
import pandas as pd
......@@ -475,20 +488,19 @@ class Node(CTENode):
n1 = e[0]
n2 = e[1]
w = G[n1][n2]['weight']
# print("\t",n1," <=> ",n2, " : ", G[n1][n2]['weight'],"\t",TERMS[n1]," <=> ",TERMS[n2], " : ", G[n1][n2]['weight'])
# print(n1," <=> ",n2, " : ", G[n1][n2]['weight'],"\t",TERMS[n1]," <=> ",TERMS[n2], "\t", G[n1][n2]['weight'])
ids[TERMS[n1]] = n1
ids[TERMS[n2]] = n2
labels[n1] = TERMS[n1]
labels[n2] = TERMS[n2]
matrix[n1][n2] = w
matrix[n2][n1] = w
matrix[ n1 ][ n2 ] = w
matrix[ n2 ][ n1 ] = w
weight[n2] = weight.get( n2, 0) + w
weight[n1] = weight.get( n1, 0) + w
weight[TERMS[n2]] = weight.get(TERMS[n2], 0) + w
weight[TERMS[n1]] = weight.get(TERMS[n1], 0) + w
print("\n===================\nNUMBER OF NGRAMS:",len(weight.keys()))
df = pd.DataFrame(matrix).fillna(0)
x = copy(df.values)
x = x / x.sum(axis=1)
......@@ -499,27 +511,23 @@ class Node(CTENode):
#matrix_filtered = np.where(x > threshold, x, 0)
#matrix_filtered = matrix_filtered.resize((90,90))
G = nx.from_numpy_matrix(matrix_filtered)
G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
print("NUMBER OF NODES:",len(G))
# G = nx.relabel_nodes(G, dict(enumerate([ labels[label] for label in list(df.columns)])))
partition = best_partition(G)
data = []
if type == "node_link":
for community in set(partition.values()):
#print(community)
G.add_node("cluster " + str(community), hidden=1)
for node in G.nodes():
try:
#node,type(labels[node])
G.node[node]['label'] = node
G.node[node]['name'] = node
G.node[node]['pk'] = ids[str(node)]
G.node[node]['label'] = TERMS[node]
G.node[node]['pk'] = node
G.node[node]['size'] = weight[node]
G.node[node]['group'] = partition[node]
G.add_edge(node, "cluster " + str(partition[node]), weight=3)
except Exception as error:
print(error)
print("IMA IN node_link CASE")
print("ERROR:",error)
data = json_graph.node_link_data(G)
elif type == "adjacency":
......@@ -533,10 +541,8 @@ class Node(CTENode):
#G.add_edge(node, partition[node], weight=3)
except Exception as error:
print(error)
print("IMA IN adjacency CASE")
data = json_graph.node_link_data(G)
print("* * * * FINISHED * * * *")
return data
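In the node_link branch above, each term gets attached to a hidden "cluster N" pseudo-node for its Louvain community before serialization. A sketch of that shape, assuming the python-louvain package (best_partition) and the networkx 1.x G.node attribute dict the code uses:

    import networkx as nx
    from networkx.readwrite import json_graph
    from community import best_partition   # python-louvain

    G = nx.Graph()
    G.add_edge('brain', 'imaging', weight=2.0)   # toy co-occurrence edges
    G.add_edge('imaging', 'scanner', weight=1.0)

    partition = best_partition(G)
    for community_id in set(partition.values()):
        G.add_node("cluster " + str(community_id), hidden=1)
    for node in list(partition.keys()):          # term nodes only
        G.node[node]['label'] = node
        G.node[node]['group'] = partition[node]
        G.add_edge(node, "cluster " + str(partition[node]), weight=3)

    data = json_graph.node_link_data(G)          # same serializer as above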
......@@ -554,14 +560,14 @@ class Node(CTENode):
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" parse_resources()__MOV [s]",(end - start))
print("LOG::TIME: In workflow() writeMetadata__MOV()")
start = time.time()
self.writeMetadata__MOV( metadata_list=theMetadata )
theMetadata = self.writeMetadata__MOV( metadata_list=theMetadata )
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" writeMetadata__MOV() [s]",(end - start))
print("LOG::TIME: In workflow() extract_ngrams__MOV()")
start = time.time()
FreqList = self.extract_ngrams__MOV(theMetadata , keys=['title'] )
......@@ -580,10 +586,14 @@ class Node(CTENode):
start = time.time()
print("LOG::TIME: In workflow() do_coocmatrix()")
jsongraph = self.do_coocmatrix__MOV ( resultDict["TERMS"] , resultDict["G"] , n=150)
jsongraph["stats"] = resultDict["ii"]
end = time.time()
total += (end - start)
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" do_coocmatrix() [s]",(end - start))
# import pprint
# pprint.pprint(jsongraph)
print("the user:",self.user)
print("the project id:",self.parent.id)
print("the corpus id:",self.id)
......
......@@ -25,7 +25,7 @@ class PubmedFileParser(FileParser):
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
# "abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"title2" : 'MedlineCitation/Article/VernacularTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[@type=doi]',
......
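metadata_path maps field names to ElementTree paths inside each PubmedArticle record; the commented-out abstract line drops that field from extraction. A minimal standalone sketch of how such a table is applied (the XML snippet is invented):

    import xml.etree.ElementTree as ET

    metadata_path = {
        "journal": 'MedlineCitation/Article/Journal/Title',
        "title":   'MedlineCitation/Article/ArticleTitle',
    }

    xml = """<PubmedArticle><MedlineCitation><Article>
      <Journal><Title>Nature</Title></Journal>
      <ArticleTitle>An example title</ArticleTitle>
    </Article></MedlineCitation></PubmedArticle>"""

    root = ET.fromstring(xml)
    metadata = {key: getattr(root.find(path), 'text', None)
                for key, path in metadata_path.items()}
    # -> {'journal': 'Nature', 'title': 'An example title'}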
......@@ -31,7 +31,7 @@ def getGlobalStats(request ):
alist = ["bar","foo"]
if request.method == "POST":
N = 100
N = 10
query = request.POST["query"]
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" query =", query )
print ("LOG::TIME:_ "+datetime.datetime.now().isoformat()+" N =", N )
......@@ -73,8 +73,6 @@ def doTheQuery(request , project_id):
alist = ["hola","mundo"]
if request.method == "POST":
query = request.POST["query"]
name = request.POST["string"]
......@@ -101,14 +99,12 @@ def doTheQuery(request , project_id):
type_id = NodeType.objects.get(name='Document').id
user_id = User.objects.get( username=request.user ).id
corpus = Node(
user=request.user,
parent=parent,
type=node_type,
name=name,
)
corpus.save()
tasks = MedlineFetcher()
......@@ -132,12 +128,12 @@ def doTheQuery(request , project_id):
# do the WorkFlow
try:
if DEBUG is True:
# corpus.workflow() # old times...
corpus.workflow__MOV()
corpus.workflow() # old times...
# corpus.workflow__MOV()
# corpus.write_everything_to_DB()
else:
# corpus.workflow.apply_async((), countdown=3)
corpus.workflow__MOV() # synchronous! because it's faaast
corpus.workflow.apply_async((), countdown=3)
# corpus.workflow__MOV() # synchronous! because it's faaast
# corpus.write_everything_to_DB.apply_async((), countdown=3) # asynchronous
......
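The flip above moves production back to Celery's apply_async with a three-second countdown, while the synchronous __MOV variants are parked in comments. Schematically, under the assumption that workflow is registered as a Celery task (the decorator wiring is not shown in this diff):

    from celery import shared_task

    DEBUG = True        # stand-in for the project setting

    @shared_task
    def workflow(corpus_id):
        # Stand-in for corpus.workflow(): parse, extract ngrams, build the cooc graph.
        return corpus_id

    corpus_id = 42      # hypothetical
    if DEBUG:
        workflow(corpus_id)                               # synchronous, easier to debug
    else:
        workflow.apply_async((corpus_id,), countdown=3)   # queued; runs ~3 seconds later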
......@@ -260,8 +260,8 @@
</div>
<div id="topPapers"></div>
<!--
<div id="tab-container-top" class='tab-container'>
<ul class='etabs'>
......@@ -278,6 +278,7 @@
</div>
</div>
</div>
-->
......