Commit 8f980cc5 authored by Administrator's avatar Administrator

extract stuff (SQL query factored out into save_newgrams)

parent ed5cf272
...@@ -15,10 +15,10 @@ from analysis.languages import english_stem ...@@ -15,10 +15,10 @@ from analysis.languages import english_stem
# from analysis.languages import french_stem as stem # from analysis.languages import french_stem as stem
# print("Selection langue anglaise") # print("Selection langue anglaise")
# Module-level accumulators shared by the extraction helpers below.
stemmer = EnglishStemmer()

# Set of (terms, stems, count) tuples collected across documents.
# NOTE(review): `l` shadows nothing but is a poor name — kept because
# sibling functions in this file reference it.
l = set()

# Nested counts keyed as d[word][document_id]['count'] -> int.
d = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
#if isinstance(corpus, Corpus) and field in [ column.name for column in Document._meta.fields]: # if isinstance(corpus, Corpus) and field in [ column.name for column in Document._meta.fields]:
def save_newgrams(new_grams, new_gram_docs=None):
    """Persist freshly extracted ngrams into the permanent tables.

    Bulk-inserts the pending rows into the temporary tables, then runs
    one SQL batch that (1) copies not-yet-known terms into
    ``documents_ngram``, (2) resolves term/document ids into
    ``documents_ngramdocument``, and (3) empties both temporary tables.

    new_grams     -- iterable of NgramTemporary instances to insert.
    new_gram_docs -- iterable of NgramDocumentTemporary instances; when
                     omitted, falls back to the free variable
                     ``new_gramDoc`` exactly as the original code did.
                     Pass it explicitly — the fallback relies on a name
                     defined in the caller's scope and will raise
                     NameError otherwise.
    """
    if new_gram_docs is None:
        # NOTE(review): original read `new_gramDoc`, a local of
        # words_field, from here — a NameError at call time. Kept only
        # as a backward-compatible fallback; prefer the parameter.
        new_gram_docs = new_gramDoc
    NgramTemporary.objects.bulk_create(new_grams)
    NgramDocumentTemporary.objects.bulk_create(new_gram_docs)
    # LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;
    query_string = """
    INSERT INTO documents_ngram
        SELECT * FROM documents_ngramtemporary
        WHERE NOT EXISTS (
            SELECT 1 FROM documents_ngram
            WHERE documents_ngram.terms = documents_ngramtemporary.terms
        );
    DELETE FROM documents_ngramtemporary;

    INSERT INTO documents_ngramdocument (terms_id, document_id, occurrences)
        SELECT GT.id, DT.id, NDT.occurrences
        FROM documents_ngramdocumenttemporary AS NDT
        INNER JOIN documents_document AS DT ON DT.id = NDT.document
        INNER JOIN documents_ngram    AS GT ON GT.terms = NDT.terms;
    DELETE FROM documents_ngramdocumenttemporary;
    """
    # Context manager guarantees the cursor is closed even on error.
    with connection.cursor() as cursor:
        cursor.execute(query_string)
def words_field(corpus=None, field='abstract'): def words_field(corpus=None, field='abstract'):
docs = Document.objects.filter(corpus=corpus) docs = Document.objects.filter(corpus=corpus)
def fouille(text, grammar_rule='jj_nn'):
    """Extract noun-phrase chunks from *text*.

    Sentence-tokenizes the text, POS-tags each sentence, parses it with
    the ``jj_nn`` regexp grammar, and collects the leaves of every 'NP'
    subtree. Errors on a sentence are printed and that sentence skipped.

    Returns an iterator of leaf lists (one per matched NP subtree).

    TODO: *grammar_rule* is currently ignored — only jj_nn is used.
    """
    from analysis.grammar_rules import jj_nn as rule
    grammar = nltk.RegexpParser(rule)
    sentances = nltk.sent_tokenize(text)
    result = []
    for sentance in sentances:
        try:
            tagged = pos_tag(sentance)
            tree = grammar.parse(tagged)
            # Plain for-loop over the subtree generator replaces the
            # original manual while/next()/except-break walk.
            for subtree in tree.subtrees():
                if subtree.label() == 'NP':
                    result.append(subtree.leaves())
        except Exception as e:
            # Best-effort per sentence, matching the original behaviour.
            print(e)
    return iter(result)
def ograms(text, field=None, doc=None):
    """Accumulate per-word stem counts for one document.

    Tokenizes *field* (falling back to *text*) into sentences, takes the
    words of the FIRST sentence only (original behaviour — TODO confirm
    that limiting to words[0] is intended), stems each word, and records
    ``(word, stem, n_tokens)`` in the module-level set ``l`` and the
    count in the module-level dict ``d`` under ``doc``'s id.

    text  -- raw text, used when *field* is None.
    field -- text actually analysed. The original default was
             ``doc.abstract`` evaluated at def time, which was both a
             SyntaxError (missing colon) and a NameError (no ``doc``).
    doc   -- the Document whose ``id`` keys the counts; required for the
             ``d`` update to succeed.
    """
    source = field if field is not None else text
    try:
        sentences = nltk.sent_tokenize(source)
        words = [nltk.wordpunct_tokenize(str(sentence)) for sentence in sentences]
        for word in words[0]:
            try:
                stems = stemmer.stem(str(word))
                l.add((word, stems, len(stems.split(" "))))
                # Original mixed doc.id and doc.pk (aliases in Django);
                # unified to doc.id.
                d[word][doc.id]['count'] = d[word][doc.id].get('count', 0) + 1
            except Exception:
                pass  # best-effort per word, matching original
    except Exception:
        pass  # best-effort per document, matching original
for doc in docs: for doc in docs:
try: try:
sentences = nltk.sent_tokenize(doc.abstract) sentences = nltk.sent_tokenize(doc.abstract)
words = [ nltk.wordpunct_tokenize(str(sentence)) for sentence in sentences ] words = [ nltk.wordpunct_tokenize(str(sentence)) for sentence in sentences ]
for word in words[0]: for word in words[0]:
try: try:
stems = stemmer.stem(str(word)) stems = stemmer.stem(str(word))
...@@ -79,42 +124,15 @@ def words_field(corpus=None, field='abstract'): ...@@ -79,42 +124,15 @@ def words_field(corpus=None, field='abstract'):
# #
except Exception as e: pass#print(e) except Exception as e: pass#print(e)
# l = liste
# du format: terms, stems, count
new_grams = [ Ngram(terms=x[0], stem=x[1], n=x[2]) for x in l] new_grams = [ Ngram(terms=x[0], stem=x[1], n=x[2]) for x in l]
new_gramDoc = [ NgramDocumentTemporary(terms=k, document=pk, occurrences=d[k][pk]['count']) \ new_gramDoc = [ NgramDocumentTemporary(terms=k, document=pk, occurrences=d[k][pk]['count']) \
for k in d.keys() \ for k in d.keys() \
for pk in d[k].keys()\ for pk in d[k].keys() ]
]
NgramTemporary.objects.bulk_create(new_grams)
NgramDocumentTemporary.objects.bulk_create(new_gramDoc)
cursor = connection.cursor()
# LOCK TABLE documents_ngramtemporary IN EXCLUSIVE MODE;
query_string = """
INSERT INTO documents_ngram
SELECT * FROM documents_ngramtemporary WHERE NOT EXISTS
( SELECT 1 FROM documents_ngram WHERE
documents_ngram.terms = documents_ngramtemporary.terms);
delete from documents_ngramtemporary;
INSERT INTO
documents_ngramdocument (terms_id, document_id, occurrences)
SELECT
GT.id, DT.id, NDT.occurrences
FROM
documents_ngramdocumenttemporary as NDT
INNER JOIN documents_document AS DT ON DT.id = NDT.document
INNER JOIN documents_ngram AS GT ON GT.terms = NDT.terms ;
delete from documents_ngramdocumenttemporary;
"""
cursor.execute(query_string)
save_newgrams(new_grams)
def words_fields(corpus=None, fields=['title',]): def words_fields(corpus=None, fields=['title',]):
try: try:
for field in fields: for field in fields:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment