Commit a2f0376a authored by delanoe's avatar delanoe

Merge branch 'romain-refactoring' into unstable

parents 766250e3 049dc862
......@@ -126,10 +126,27 @@ class Parser:
break
except KeyError:
language_keyerrors[key] = language_symbol
# `languages` can look up Language objects from any ISO code (iso2 or iso3)
# --------------------------------------------------------------
# > languages['fr']
# <Language iso3="fra" iso2="fr" implemented="True" name="French">
# > languages['fra']
# <Language iso3="fra" iso2="fr" implemented="True" name="French">
if language is not None:
hyperdata['language_iso2'] = language.iso2
hyperdata['language_iso3'] = language.iso3
hyperdata['language_name'] = language.name
hyperdata['language_iso3'] = language.iso3
if (language.iso2 is not None):
# NB: a language can be recognized through its iso3 code but have no iso2 code,
# because there are *more* languages in the iso3 codes (ISO 639-3)
# example:
# > languages['dnj']
# <Language iso3="dnj" iso2="None" implemented="False" name="Dan">
# ----
hyperdata['language_iso2'] = language.iso2
else:
# None would become JSON null, so we store "__unknown__" instead (more stable)
hyperdata['language_iso2'] = "__unknown__"
elif language_keyerrors:
print('Unrecognized language: %s' % ', '.join(
'%s="%s"' % (key, value) for key, value in language_keyerrors.items()
......
......@@ -54,6 +54,10 @@ def parse_extract_indexhyperdata(corpus):
corpus.status('Workflow', progress=1)
corpus.save_hyperdata()
session.commit()
# FIXME: 'Workflow' will still be incomplete when 'Index' and 'Lists' get
# stacked into hyperdata['statuses'], but calling corpus.status()
# returns only the 1st incomplete action (corpus.status() doesn't
# understand "subactions")
# apply actions
print('CORPUS #%d' % (corpus.id))
......
This diff is collapsed.
......@@ -56,7 +56,7 @@ def extract_ngrams(corpus, keys=('title', 'abstract', )):
ngramsextractor = ngramsextractors[language_iso2]
except KeyError:
# skip document
print('Unsupported language: `%s`' % (language_iso2, ))
print('Unsupported language: `%s` (doc #%i)' % (language_iso2, document.id))
# and remember that for later processes (eg stemming)
document.hyperdata['__skipped__'] = 'ngrams_extraction'
document.save_hyperdata()
......
......@@ -74,8 +74,10 @@ def _query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
- scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
deprecated: scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
(no more OCCS counts of subforms)
"""
if not details:
# simple contents
......@@ -86,12 +88,12 @@ def _query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
.query(
NodeNgramNgram.ngram2_id,
Ngram.terms,
NodeNodeNgram.score
# NodeNodeNgram.score #
)
.join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
.join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
.filter(NodeNodeNgram.node1_id == scoring_metric_id)
.order_by(desc(NodeNodeNgram.score))
# .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
# .filter(NodeNodeNgram.node1_id == scoring_metric_id)
# .order_by(desc(NodeNodeNgram.score))
)
# main filter
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment