Commit 92fc4072 authored by Administrator

[FEAT] Adding TFIDF functions.

parent 5dd8ff49
from node.models import Node, NodeType, Node_Resource,\
Project, Corpus, Document,\
Ngram, Node_Ngram, NodeNgramNgram, NodeNodeNgram,\
User, Language, ResourceType, Resource
from math import log
# - tfidf per corpus, per corpus type, across all corpora
# - tfidf per time scale
# - tfidf per source, author, etc.
# => list of lists
def tfidf(corpus, document, ngram):
    """Return the TF-IDF score of ``ngram`` for ``document`` within ``corpus``.

    The term frequency is the ``weight`` of the (document, ngram) row divided
    by the number of ``Node_Ngram`` rows attached to the document.  The
    inverse document frequency is ``log(N / n)`` where ``N`` is the number of
    Document nodes whose parent is ``corpus`` and ``n`` is the number of
    ``Node_Ngram`` rows linking ``ngram`` to one of those documents.

    Args:
        corpus: node whose Document children form the reference collection.
        document: node for which the ngram is scored.
        ngram: ngram to score.

    Returns:
        ``tf * idf``, or ``0`` when anything goes wrong (missing row, empty
        document or corpus, ...); the error is printed, never raised.
    """
    try:
        occurrences = Node_Ngram.objects.get(node=document, ngram=ngram).weight
        ngrams_in_document = Node_Ngram.objects.filter(node=document).count()
        tf = occurrences / ngrams_in_document

        # Looked up once and reused by both counts below.
        document_type = NodeType.objects.get(name="Document")
        documents_in_corpus = Node.objects.filter(
            parent=corpus, type=document_type
        ).count()
        # Restrict the document frequency to documents of this corpus so that
        # numerator and denominator describe the same collection; counting
        # rows from every node in the database could exceed the corpus size
        # and yield a negative idf.
        documents_with_ngram = Node_Ngram.objects.filter(
            ngram=ngram, node__parent=corpus, node__type=document_type
        ).count()
        idf = log(documents_in_corpus / documents_with_ngram)

        result = tf * idf
    except Exception as error:
        # Deliberate best effort: a score that cannot be computed is
        # reported and stored as 0 instead of aborting the caller.
        print(error)
        result = 0
    return result
def do_tfidf(corpus, reset=True):
    """Compute and store a TF-IDF score for every ngram of every document of ``corpus``.

    One ``NodeNodeNgram`` row is written per (corpus, document, ngram) triple,
    with ``nodex`` set to the corpus, ``nodey`` to the document and ``score``
    to the value returned by ``tfidf``.  All writes happen inside a single
    database transaction.

    Args:
        corpus: a ``Node`` whose type name is ``"Corpus"``.  Anything else is
            rejected with a printed message and nothing is modified.
        reset: when truthy, delete the scores already stored for ``corpus``
            before recomputing them; otherwise only the missing triples are
            computed.

    Returns:
        None.
    """
    # Imported here because ``transaction`` is not among the module-level
    # imports visible in this file; without it the call below is a NameError.
    from django.db import transaction

    # Validate before touching the database so that an unsupported argument
    # never triggers the reset deletion.
    if not (isinstance(corpus, Node) and corpus.type.name == "Corpus"):
        print("Only implemented for corpus yet, whereas you put:", type(corpus))
        return

    # Loop-invariant lookup, done once instead of inside the queryset call.
    document_type = NodeType.objects.get(name="Document")

    with transaction.atomic():
        if reset:
            NodeNodeNgram.objects.filter(nodex=corpus).delete()

        for document in Node.objects.filter(parent=corpus, type=document_type):
            for node_ngram in Node_Ngram.objects.filter(node=document):
                # An explicit existence test replaces the former bare
                # ``except:`` around ``.get()``, which also swallowed
                # MultipleObjectsReturned and unrelated errors and then
                # inserted yet another row.
                already_scored = NodeNodeNgram.objects.filter(
                    nodex=corpus, nodey=document, ngram=node_ngram.ngram
                ).exists()
                if already_scored:
                    continue

                score = tfidf(corpus, document, node_ngram.ngram)
                NodeNodeNgram(
                    nodex=corpus,
                    nodey=document,
                    ngram=node_ngram.ngram,
                    score=score,
                ).save()
...@@ -237,8 +237,8 @@ def project(request, project_id): ...@@ -237,8 +237,8 @@ def project(request, project_id):
) )
try: try:
corpus.parse_and_extract_ngrams() corpus.workflow()
#corpus.parse_and_extract_ngrams.apply_async((), countdown=3) #corpus.workflow((), countdown=3)
except Exception as error: except Exception as error:
print(error) print(error)
......
...@@ -69,6 +69,15 @@ except Exception as error: ...@@ -69,6 +69,15 @@ except Exception as error:
typeStem = NodeType(name='Stem') typeStem = NodeType(name='Stem')
typeStem.save() typeStem.save()
# Make sure the 'Tfidf' node type exists: reuse the stored row when there is
# one, otherwise create and save it.
try:
    typeTfidf = NodeType.objects.get(name='Tfidf')
except NodeType.DoesNotExist as error:
    # Only a missing row justifies creating one.  Any other failure (for
    # example several matching rows) now propagates instead of silently
    # inserting an additional 'Tfidf' type.
    print(error)
    typeTfidf = NodeType(name='Tfidf')
    typeTfidf.save()
try: try:
typeDoc = NodeType.objects.get(name='WhiteList') typeDoc = NodeType.objects.get(name='WhiteList')
except Exception as error: except Exception as error:
...@@ -147,3 +156,5 @@ except: ...@@ -147,3 +156,5 @@ except:
stem.save() stem.save()
...@@ -234,7 +234,7 @@ class Node(CTENode): ...@@ -234,7 +234,7 @@ class Node(CTENode):
]) ])
@current_app.task(filter=task_method) @current_app.task(filter=task_method)
def parse_and_extract_ngrams(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False): def workflow(self, keys=None, ngramsextractorscache=None, ngramscaches=None, verbose=False):
self.parse_resources() self.parse_resources()
type_document = NodeType.objects.get(name='Document') type_document = NodeType.objects.get(name='Document')
self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',]) self.children.filter(type_id=type_document.pk).extract_ngrams(keys=['title',])
...@@ -298,4 +298,26 @@ class NodeNgramNgram(models.Model): ...@@ -298,4 +298,26 @@ class NodeNgramNgram(models.Model):
return "%s: %s / %s" % (self.node.name, self.ngramx.terms, self.ngramy.terms) return "%s: %s / %s" % (self.node.name, self.ngramx.terms, self.ngramy.terms)
class NodeNodeNgram(models.Model):
    """Score attached to a (nodex, nodey, ngram) triple.

    The class is declared once here; the original declared it twice in a row
    with identical bodies, the second definition merely shadowing the first.

    NOTE(review): in the TF-IDF code added by the same commit, ``nodex`` holds
    the corpus and ``nodey`` the document -- confirm whether other callers use
    the two slots differently.
    """

    # First node of the pair; the reverse accessor on Node is "nodex".
    # on_delete is spelled out to match the ``ngram`` field below (CASCADE is
    # the behavior Django applied when the argument was omitted).
    nodex = models.ForeignKey(Node, related_name="nodex", on_delete=models.CASCADE)
    # Second node of the pair; the reverse accessor on Node is "nodey".
    nodey = models.ForeignKey(Node, related_name="nodey", on_delete=models.CASCADE)
    # Ngram the score refers to; rows are removed together with their ngram.
    ngram = models.ForeignKey(Ngram, on_delete=models.CASCADE)
    # Numeric score for the triple, 0 when not set.
    score = models.FloatField(default=0)

    def __str__(self):
        """Return ``"<nodex name>: <nodey name> / <ngram terms> = <score>"``."""
        return "%s: %s / %s = %s" % (
            self.nodex.name,
            self.nodey.name,
            self.ngram.terms,
            self.score,
        )
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment