Commit e42368f3 authored by PkSM3

[UPDATE] conflict merge solved

parents 60fc2c08 d18065e7
...@@ -249,9 +249,10 @@ def tfidf(corpus, document, ngram): ...@@ -249,9 +249,10 @@ def tfidf(corpus, document, ngram):
xx = Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")).count() xx = Node.objects.filter(parent=corpus, type=NodeType.objects.get(name="Document")).count()
yy = Node_Ngram.objects.filter(ngram=ngram).count() yy = Node_Ngram.objects.filter(ngram=ngram).count()
idf= log(xx/yy) inverse_d_frequency= log(xx/yy)
result = tf * idf # result = tf * idf
result = term_frequency * inverse_d_frequency
except Exception as error: except Exception as error:
print(error) print(error)
result = 0 result = 0
......
from node.models import Node, NodeType, Node_Resource,\
Project, Corpus, Document,\
Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram,\
User, Language, ResourceType, Resource
from math import log
# - tfidf / corpus, corpus type, all corpora
# - tfidf / time scale
# - tfidf / sources, authors, etc.
# => list of lists
def tfidf(corpus, document, ngram):
    """Return the tf-idf score of ``ngram`` for ``document`` within ``corpus``.

    The term frequency is the ``weight`` of the ``Node_Ngram`` row linking
    ``document`` and ``ngram``, divided by the number of ``Node_Ngram`` rows
    attached to ``document``.  The inverse document frequency is the natural
    logarithm of the number of "Document" nodes whose parent is ``corpus``
    divided by the number of ``Node_Ngram`` rows referencing ``ngram``.

    Any exception raised along the way is printed and the score falls back
    to 0.
    """
    try:
        ngram_link = Node_Ngram.objects.get(node=document, ngram=ngram)
        links_in_document = Node_Ngram.objects.filter(node=document).count()
        term_frequency = ngram_link.weight / links_in_document

        document_type = NodeType.objects.get(name="Document")
        documents_in_corpus = Node.objects.filter(
            parent=corpus, type=document_type
        ).count()
        links_with_ngram = Node_Ngram.objects.filter(ngram=ngram).count()
        inverse_document_frequency = log(documents_in_corpus / links_with_ngram)

        return term_frequency * inverse_document_frequency
    except Exception as error:
        # Best-effort scoring: report the problem and use a neutral score.
        print(error)
        return 0
def do_tfidf(corpus, reset=True):
    """Compute and store the tf-idf score of every ngram of ``corpus``' documents.

    For each "Document" node whose parent is ``corpus`` and each
    ``Node_Ngram`` row attached to that document, a ``NodeNodeNgram`` row
    (``nodex=corpus``, ``nodey=document``) holding the score returned by
    ``tfidf`` is saved, unless such a row already exists.  All the work runs
    inside one ``transaction.atomic()`` block.

    Args:
        corpus: expected to be a ``Node`` whose ``type.name`` is "Corpus";
            for anything else a message is printed and nothing is computed.
        reset: when truthy (the default), the ``NodeNodeNgram`` rows whose
            ``nodex`` is ``corpus`` are deleted before recomputing.

    Returns:
        None.
    """
    # NOTE(review): `transaction` is not among the imports visible at the top
    # of this file -- confirm `from django.db import transaction` is present.
    with transaction.atomic():
        if reset:
            NodeNodeNgram.objects.filter(nodex=corpus).delete()

        if not (isinstance(corpus, Node) and corpus.type.name == "Corpus"):
            print("Only implemented for corpus yet, whereas you put:", type(corpus))
            return

        document_type = NodeType.objects.get(name="Document")
        for document in Node.objects.filter(parent=corpus, type=document_type):
            for node_ngram in Node_Ngram.objects.filter(node=document):
                # Explicit existence test instead of get() guarded by a bare
                # `except:`: a bare except treated *any* failure (database
                # error, duplicate rows, KeyboardInterrupt) as "row missing"
                # and inserted a new row.
                already_scored = NodeNodeNgram.objects.filter(
                    nodex=corpus, nodey=document, ngram=node_ngram.ngram
                ).exists()
                if already_scored:
                    continue

                score = tfidf(corpus, document, node_ngram.ngram)
                # node_ngram.node is `document` here (the queryset above is
                # filtered on node=document).
                NodeNodeNgram(
                    nodex=corpus,
                    nodey=document,
                    ngram=node_ngram.ngram,
                    score=score,
                ).save()
...@@ -17,29 +17,33 @@ urlpatterns = patterns('', ...@@ -17,29 +17,33 @@ urlpatterns = patterns('',
url(r'^login/', include(admin.site.urls)), url(r'^login/', include(admin.site.urls)),
url(r'^grappelli/', include('grappelli.urls')), url(r'^grappelli/', include('grappelli.urls')),
# User views # User Home view
url(r'^$', views.home), url(r'^$', views.home),
# Project Management
url(r'^projects/$', views.projects), url(r'^projects/$', views.projects),
url(r'^project/(\d+)/delete/$', views.delete_project), url(r'^project/(\d+)/delete/$', views.delete_project),
url(r'^project/(\d+)/$', views.project), url(r'^project/(\d+)/$', views.project),
# Corpus management
url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus), url(r'^project/(\d+)/corpus/(\d+)/$', views.corpus),
url(r'^project/(\d+)/corpus/(\d+)/delete/$', views.delete_corpus), url(r'^project/(\d+)/corpus/(\d+)/delete/$', views.delete_corpus),
url(r'^project/(\d+)/corpus/(\d+)/corpus.csv$', views.corpus_csv),
url(r'^project/(\d+)/corpus/(\d+)/timerange/(\d+)/(\d+)$', views.subcorpus), url(r'^project/(\d+)/corpus/(\d+)/timerange/(\d+)/(\d+)$', views.subcorpus),
# Visualizations # Visualizations
url(r'^corpus/(\d+)/explorer$', views.explorer_graph), url(r'^project/(\d+)/corpus/(\d+)/chart$', views.chart),
url(r'^corpus/(\d+)/matrix$', views.explorer_matrix), url(r'^corpus/(\d+)/explorer$', views.graph),
url(r'^corpus/(\d+)/matrix$', views.matrix),
# Getting data # Data management
url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv), url(r'^chart/corpus/(\d+)/data.csv$', views.send_csv),
url(r'^corpus/(\d+)/node_link.json$', views.node_link), url(r'^corpus/(\d+)/node_link.json$', views.node_link),
url(r'^corpus/(\d+)/adjacency.json$', views.adjacency), url(r'^corpus/(\d+)/adjacency.json$', views.adjacency),
url(r'^api/tfidf/(\d+)/(\w+)$', views.tfidf), url(r'^api/tfidf/(\d+)/(\w+)$', views.tfidf),
# Data management
url(r'^api$', gargantext_web.api.Root), url(r'^api$', gargantext_web.api.Root),
url(r'^api/nodes/(\d+)/children/metadata$', gargantext_web.api.NodesChildrenMetatadata.as_view()), url(r'^api/nodes/(\d+)/children/metadata$', gargantext_web.api.NodesChildrenMetatadata.as_view()),
url(r'^api/nodes/(\d+)/children/queries$', gargantext_web.api.NodesChildrenQueries.as_view()), url(r'^api/nodes/(\d+)/children/queries$', gargantext_web.api.NodesChildrenQueries.as_view()),
...@@ -50,7 +54,6 @@ urlpatterns = patterns('', ...@@ -50,7 +54,6 @@ urlpatterns = patterns('',
url(r'^api/nodes/(\d+)/ngrams$', gargantext_web.api.CorpusController.ngrams), url(r'^api/nodes/(\d+)/ngrams$', gargantext_web.api.CorpusController.ngrams),
url(r'^graph-it$', views.graph_it),
url(r'^ngrams$', views.ngrams), url(r'^ngrams$', views.ngrams),
url(r'^nodeinfo/(\d+)$', views.nodeinfo), url(r'^nodeinfo/(\d+)$', views.nodeinfo),
url(r'^tests/mvc$', views.tests_mvc), url(r'^tests/mvc$', views.tests_mvc),
......
...@@ -285,8 +285,8 @@ def corpus(request, project_id, corpus_id): ...@@ -285,8 +285,8 @@ def corpus(request, project_id, corpus_id):
project = Node.objects.get(id=project_id) project = Node.objects.get(id=project_id)
corpus = Node.objects.get(id=corpus_id) corpus = Node.objects.get(id=corpus_id)
#documents = corpus.children.all() type_doc = NodeType.objects.get(name="Document")
#number = corpus.children.count() number = Node.objects.filter(parent=corpus, type=type_doc).count()
# try: # try:
# sources = defaultdict(int) # sources = defaultdict(int)
...@@ -357,7 +357,7 @@ def corpus(request, project_id, corpus_id): ...@@ -357,7 +357,7 @@ def corpus(request, project_id, corpus_id):
'project': project,\ 'project': project,\
'corpus' : corpus,\ 'corpus' : corpus,\
'documents': documents,\ 'documents': documents,\
# 'number' : number,\ 'number' : number,\
'dates' : chart,\ 'dates' : chart,\
})) }))
...@@ -512,8 +512,22 @@ def delete_corpus(request, project_id, corpus_id): ...@@ -512,8 +512,22 @@ def delete_corpus(request, project_id, corpus_id):
Node.objects.filter(id=corpus_id).all().delete() Node.objects.filter(id=corpus_id).all().delete()
return HttpResponseRedirect('/project/' + project_id) return HttpResponseRedirect('/project/' + project_id)
def explorer_graph(request, corpus_id):
t = get_template('explorer.html') def chart(request, project_id, corpus_id):
''' Charts to compare, filter, count'''
t = get_template('chart.html')
user = request.user
date = datetime.datetime.now()
project = Node.objects.get(id=project_id)
html = t.render(Context({
'user': user,
'date': date,
'project' : project,
}))
return HttpResponse(html)
def matrix(request, corpus_id):
t = get_template('matrix.html')
user = request.user user = request.user
date = datetime.datetime.now() date = datetime.datetime.now()
corpus = Node.objects.get(id=corpus_id) corpus = Node.objects.get(id=corpus_id)
...@@ -526,8 +540,8 @@ def explorer_graph(request, corpus_id): ...@@ -526,8 +540,8 @@ def explorer_graph(request, corpus_id):
return HttpResponse(html) return HttpResponse(html)
def explorer_matrix(request, corpus_id): def graph(request, corpus_id):
t = get_template('matrix.html') t = get_template('explorer.html')
user = request.user user = request.user
date = datetime.datetime.now() date = datetime.datetime.now()
corpus = Node.objects.get(id=corpus_id) corpus = Node.objects.get(id=corpus_id)
...@@ -540,6 +554,10 @@ def explorer_matrix(request, corpus_id): ...@@ -540,6 +554,10 @@ def explorer_matrix(request, corpus_id):
return HttpResponse(html) return HttpResponse(html)
def exploration(request): def exploration(request):
t = get_template('exploration.html') t = get_template('exploration.html')
user = request.user user = request.user
...@@ -567,6 +585,36 @@ def explorer_chart(request): ...@@ -567,6 +585,36 @@ def explorer_chart(request):
import csv import csv
from django.db import connection from django.db import connection
def corpus_csv(request, project_id, corpus_id):
    '''
    Return the metadata of a corpus' documents as a CSV attachment.

    The response is an HttpResponse with content type text/csv and a
    Content-Disposition header naming the download "corpus.csv".  The header
    row is the list of metadata keys of the first "Document" node whose
    parent is the corpus; each following row holds one document's values for
    those keys, with an empty string where a key cannot be read.

    request and project_id are accepted because the URL pattern supplies
    them; only corpus_id is used to look the data up.

    When the corpus has no document, the response is returned with an empty
    body instead of failing on the lookup of the first document.
    '''
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="corpus.csv"'
    writer = csv.writer(response)

    corpus = Node.objects.get(id=corpus_id)
    type_document = NodeType.objects.get(name="Document")
    documents = Node.objects.filter(parent=corpus, type=type_document)

    try:
        keys = list(documents[0].metadata.keys())
    except IndexError:
        # Empty corpus: there is no first document to take the columns from.
        return response

    writer.writerow(keys)
    for doc in documents:
        row = []
        for key in keys:
            try:
                row.append(doc.metadata[key])
            except (KeyError, TypeError):
                # Key absent from this document (KeyError), or -- presumably --
                # metadata not subscriptable, e.g. None (TypeError): leave the
                # cell empty.
                row.append("")
        writer.writerow(row)

    return response
def send_csv(request, corpus_id): def send_csv(request, corpus_id):
''' '''
Create the HttpResponse object with the appropriate CSV header. Create the HttpResponse object with the appropriate CSV header.
......
...@@ -2,6 +2,7 @@ import re ...@@ -2,6 +2,7 @@ import re
import locale import locale
from lxml import etree from lxml import etree
from datetime import datetime, date from datetime import datetime, date
from django.utils import timezone
from .FileParser import FileParser from .FileParser import FileParser
from ..NgramsExtractors import * from ..NgramsExtractors import *
...@@ -17,14 +18,17 @@ class EuropressFileParser(FileParser): ...@@ -17,14 +18,17 @@ class EuropressFileParser(FileParser):
if isinstance(file, str): if isinstance(file, str):
file = open(file, 'rb') file = open(file, 'rb')
print(file) #print(file)
contents = file.read() contents = file.read()
print(len(contents)) #print(len(contents))
#return [] #return []
encoding = self.detect_encoding(contents) encoding = self.detect_encoding(contents)
print(encoding)
if encoding != "utf-8":
contents = contents.decode(encoding, errors='replace').encode(codif)
try: try:
html_parser = etree.HTMLParser(encoding=encoding) html_parser = etree.HTMLParser(encoding=codif)
html = etree.fromstring(contents, html_parser) html = etree.fromstring(contents, html_parser)
html_articles = html.xpath('/html/body/table') html_articles = html.xpath('/html/body/table')
except: except:
...@@ -54,7 +58,7 @@ class EuropressFileParser(FileParser): ...@@ -54,7 +58,7 @@ class EuropressFileParser(FileParser):
if isinstance(text, bytes): if isinstance(text, bytes):
text = text.decode(encoding) text = text.decode(encoding)
format_date_fr = re.compile('\d+\s*\w+\s+\d{4}', re.UNICODE) format_date_fr = re.compile('\d*\s*\w+\s+\d{4}', re.UNICODE)
test_date_fr = format_date_fr.match(text) test_date_fr = format_date_fr.match(text)
format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE) format_date_en = re.compile('\w+\s+\d+,\s+\d{4}', re.UNICODE)
...@@ -69,12 +73,18 @@ class EuropressFileParser(FileParser): ...@@ -69,12 +73,18 @@ class EuropressFileParser(FileParser):
if test_date_fr is not None: if test_date_fr is not None:
self.localeEncoding = "fr_FR" self.localeEncoding = "fr_FR"
locale.setlocale(locale.LC_ALL, localeEncoding) locale.setlocale(locale.LC_ALL, localeEncoding)
if encoding != "utf-8":
text = text.replace('י', 'é')
text = text.replace('ű', 'û')
text = text.replace(' aot ', ' août ')
try : try :
metadata['publication_date'] = datetime.strptime(text, '%d %B %Y') metadata['publication_date'] = datetime.strptime(text, '%d %B %Y')
except : except :
try: try:
metadata['publication_date'] = datetime.strptime(text, '%B %Y') metadata['publication_date'] = datetime.strptime(text, '%B %Y')
except : except :
print(text)
pass pass
if test_date_en is not None: if test_date_en is not None:
...@@ -122,7 +132,7 @@ class EuropressFileParser(FileParser): ...@@ -122,7 +132,7 @@ class EuropressFileParser(FileParser):
try: try:
back = metadata['publication_date'] back = metadata['publication_date']
except Exception as e: except Exception as e:
print(e) #print(e)
pass pass
else: else:
try: try:
...@@ -130,7 +140,7 @@ class EuropressFileParser(FileParser): ...@@ -130,7 +140,7 @@ class EuropressFileParser(FileParser):
except Exception as e: except Exception as e:
print(e) print(e)
except : except :
metadata['publication_date'] = datetime.now() metadata['publication_date'] = timezone.now()
#if lang == 'fr': #if lang == 'fr':
#metadata['language_iso2'] = 'fr' #metadata['language_iso2'] = 'fr'
...@@ -161,12 +171,5 @@ class EuropressFileParser(FileParser): ...@@ -161,12 +171,5 @@ class EuropressFileParser(FileParser):
# pprint(metadata_list) # pprint(metadata_list)
# return [] # return []
return metadata_list return metadata_list
#
from NgramsExtractors import *
from Taggers import *
#texts = [
# "This is quite a simple test.",
# "Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe.",
# "James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour.",
#]
#tagger = NltkTagger()
#extractor = EnglishNgramsExtractor()
#
# French sample sentences run through the tagger and the ngram extractor below.
texts = [
    "La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini.",
    "Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie.",
    "Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone.",
]

tagger = TreeTagger()
extractor = FrenchNgramsExtractor()

# For every sample: print its tagged form, a blank line, then each extracted
# ngram on its own tab-indented line, followed by a separator.
for sample in texts:
    tagged_sample = tagger.tag_text(sample)
    print(tagged_sample)
    print()
    for extracted in extractor.extract_ngrams(sample):
        print("\t" + str(extracted))
    print("\n")
This diff is collapsed.
...@@ -23,17 +23,19 @@ ...@@ -23,17 +23,19 @@
<div class="container theme-showcase" role="main"> <div class="container theme-showcase" role="main">
<div class="jumbotron"> <div class="jumbotron">
{% if project %} {% if project %}
<h1>{{ project.name }} </h1> <h1>{{ project.name }}, {{ corpus.name }}
</h1>
{% endif %} {% endif %}
{% if corpus %} {% if corpus %}
{{ corpus.name }} <p>
, Created on {{ corpus.date }} ({{ number}} docs)</p> {{ number}} docs, Created on {{ corpus.date }}
</p>
{% endif %} {% endif %}
<a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.pk }}/">Import</a> <!-- <a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.pk }}/">Add file</a> --!>
<a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.pk }}/">Export</a> <a class="btn btn-primary btn-lg" role="button" href="/project/{{project.pk}}/corpus/{{ corpus.pk }}/corpus.csv">Save as</a>
<a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.pk }}/">Delete</a></p> <a class="btn btn-primary btn-lg" role="button" href="/project/{{project.pk}}/corpus/{{ corpus.pk }}/delete">Delete</a></p>
{% if number == 0 %} {% if number == 0 %}
<a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.pk }}/">Add documents</a></p> <a class="btn btn-primary btn-lg" role="button" href="/admin/documents/corpus/{{ corpus.pk }}/">Add documents</a></p>
...@@ -48,7 +50,7 @@ ...@@ -48,7 +50,7 @@
<div class="row"> <div class="row">
<div id="monthly-move-chart"> <div id="monthly-move-chart">
<center> <center>
<strong>Title</strong> (Blue Line: Publications by months, Green Line: Zoomable publications) <strong>Title</strong> (Blue bars: all, Green line: zoom)
<a class="reset" href="javascript:volumeChart.filterAll();dc.redrawAll();" <a class="reset" href="javascript:volumeChart.filterAll();dc.redrawAll();"
style="display: none;">reset</a> style="display: none;">reset</a>
<div class="clearfix"></div> <div class="clearfix"></div>
...@@ -96,34 +98,37 @@ ...@@ -96,34 +98,37 @@
<div class="row"> <div class="row">
<div class="col-md-4"> <div class="col-md-4">
<div class="jumbotron"> <div class="jumbotron">
<h3><a href="/graph-it">Documents</a></h3> <h3><a href="/project/{{project.id}}/corpus/{{corpus.id}}/chart">Advanced charts</a></h3>
<ol> <ol>
<li>Read</li> <!-- write --> <li>Count</li> <!-- read, compute -->
<li>Count</li> <!-- compute --> <li>Filter</li> <!-- count, compute -->
<li>Select</li> <!-- cut --> <li>Compare</li> <!-- select, cut -->
</ol> </ol>
<h4><a href="/project/{{project.id}}/corpus/{{corpus.id}}/">Back to corpus</a></h3>
</div> </div>
</div> </div>
<div class="col-md-4"> <div class="col-md-4">
<div class="jumbotron"> <div class="jumbotron">
<h3><a href="/ngrams">Dictionaries</a></h3> <h3><a href="/corpus/{{corpus.id}}/matrix">Matrix</a></h3>
<ol> <ol>
<li>Synonyms</li> <li>Sort</li>
<li>Black Lists</li> <li>Group</li>
<li>White Lists</li> <li>Cluster</li>
</ol> </ol>
<h4><a href="/project/{{project.id}}/corpus/{{corpus.id}}/">Back to corpus</a></h3>
</div> </div>
</div> </div>
<div class="col-md-4"> <div class="col-md-4">
<div class="jumbotron"> <div class="jumbotron">
<h3><a href="/corpus/{{ corpus.id }}/explorer">Visualizations</a></h3> <h3><a href="/corpus/{{ corpus.id }}/explorer">Graph</a></h3>
<ol> <ol>
<li><a href="/corpus/{{ corpus.id }}/matrix">Adjacency matrix</a></li> <li>Visualize</li>
<li><a href="/corpus/{{ corpus.id }}/explorer">Static maps</a></li> <li>Explore</li>
<li>Dynamic maps</li> <li>Read</li>
</ol> </ol>
<h4><a href="/project/{{project.id}}/corpus/{{corpus.id}}/">Back to corpus</a></h3>
</div> </div>
</div> </div>
</div> </div>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment