Commit 974ce530 authored by c24b's avatar c24b

INSTALL3

parents 7fe3cec7 bad5e8c5
......@@ -133,8 +133,6 @@
background: white;
font-size: 0.8em;
font-weight: 600;
-webkit-box-shadow: 1px 1px 2px rgba(0, 0, 0, 0.5);
-moz-box-shadow: 1px 1px 2px rgba(0, 0, 0, 0.5);
box-shadow: 1px 1px 2px rgba(0, 0, 0, 0.5);
}
......
......@@ -26,14 +26,113 @@
// ex: projects/1/corpora/2/documents/9/
// ex: projects/1/corpora/2/documents/9/focus=2677 (to highlight ngram 2677 more)
var path = window.location.pathname.match(/\/projects\/(.*)\/corpora\/(.*)\/documents\/(.*)\/(?:focus=([0-9,]+))?/);
// shared vars -------------------
$rootScope.projectId = path[1];
$rootScope.corpusId = path[2];
$rootScope.docId = path[3];
// ex: ["483", "3561", "9754", "35183"]
// (passed from graphExplorer selections)
$rootScope.focusNgrams = path[4].split(",");
if (path[4])
$rootScope.focusNgrams = path[4].split(",");
else
$rootScope.focusNgrams = []
// -------------------------------
// shared toolbox (functions useful for several modules) -------------------
$rootScope.mafonction = function (bidule) {console.warn(bidule)}
// chained recursion to do several AJAX actions and then a callback (eg refresh)
$rootScope.makeChainedCalls =
function (i, listOfCalls, finalCallback, lastCache) {
var callDetails = listOfCalls[i]
console.log(">> calling ajax call ("+(i+1)+"/"+listOfCalls.length+")")
// each callDetails object describes the Ajax call
// and passes the required functions and arguments
// via 3 properties: service, action, params
// ex: callDetails = {
// 'service' : MainApiChangeNgramHttpService,
// 'action' : 'delete',
// 'params' : { 'listId': ..., 'ngramIdList':...}
// there is an optional 4th slot: the dataPropertiesToCache directive
//
// 'dataPropertiesToCache' : ['id'] <== means that on call success
// we will store data.id into
// cache.id for next calls
// }
var service = callDetails['service']
var params = callDetails['params']
var action = callDetails['action']
// cache if we need to store properties of data response for next calls
var cache = {}
if (lastCache) cache = lastCache
// and interpolation of params with this current cache
for (var key in params) {
var val = params[key]
if (typeof val == "object" && val["fromCache"]) {
var propToRead = val["fromCache"]
// console.log("reading from cache: response data property "
// +propToRead+" ("+cache[propToRead]+")")
params[key] = cache[propToRead]
}
}
// Now we run the call
// ex:
// service action
// vvvvv vvvv
// MainApiChangeNgramHttpService["delete"](
// params >>> {'listId': listId, 'ngramIdList': ngramId},
// onsuccess(), onfailure() )
service[action](
params,
// on success
function(data) {
// console.log("SUCCESS:" + action)
// console.log("listOfCalls.length:" + listOfCalls.length)
// console.log("i+1:" + i+1)
// case NEXT
// ----
// when chained actions
if (listOfCalls.length > i+1) {
// if we need to store anything it's the right moment
for (var k in callDetails['dataPropertiesToCache']) {
var prop = callDetails['dataPropertiesToCache'][k]
// console.log("storing in cache: response data property "
// +prop+" ("+data[prop]+")")
cache[prop] = data[prop]
}
// ======= recursive call for next action in list ================
$rootScope.makeChainedCalls(i+1, listOfCalls, finalCallback, cache)
// ================================================================
}
// case LAST
// ------
// when last action
else {
console.log(">> calling refresh")
finalCallback()
}
},
// on error
function(data) {
console.error("unable to call ajax no "+i+" with service "+service.name+
" (http "+action+" with args "+JSON.stringify(params)+")");
}
);
}
// -------------------------------------------------------------------------
// debug
// console.log("==> $rootScope <==")
......
......@@ -86,48 +86,31 @@
);
});
/*
* NgramHttpService: Create, modify or delete 1 Ngram
* =================
*
* TODO add a create case separately and then remove service
*
* NB : replaced by external api: (MainApiChangeNgramHttpService)
* api/ngramlists/change?list=LISTID&ngrams=ID1,ID2..
*
* old logic:
* ----------
* if new ngram
* -> ngram_id will be "create"
* -> route: annotations/lists/@node_id/ngrams/create
* -> will land on views.NgramCreate
*
* else:
* -> ngram_id is a real ngram id
* -> route: annotations/lists/@node_id/ngrams/@ngram_id
* -> will land on views.NgramEdit
* MainApiAddNgramHttpService: Create and index a new ngram
* ===========================
* route: PUT api/ngrams?text=mynewngramstring&corpus=corpus_id
* ------
*
*/
// http.factory('NgramHttpService', function ($resource) {
// return $resource(
// window.ANNOTATION_API_URL + 'lists/:listId/ngrams/:ngramId',
// {
// listId: '@listId',
// ngramId: '@id'
// },
// {
// post: {
// method: 'POST',
// params: {'listId': '@listId', 'ngramId': '@ngramId'}
// },
// delete: {
// method: 'DELETE',
// params: {'listId': '@listId', 'ngramId': '@ngramId'}
// }
// }
// );
// });
http.factory('MainApiAddNgramHttpService', function($resource) {
return $resource(
// adding explicit "http://" because this is a cross-origin request
'http://' + window.GARG_ROOT_URL
+ "/api/ngrams?text=:ngramStr&corpus=:corpusId",
{
ngramStr: '@ngramStr',
corpusId: '@corpusId'
},
{
put: {
method: 'PUT',
params: {listId: '@listId', ngramIdList: '@ngramIdList'}
}
}
);
});
/*
* MainApiChangeNgramHttpService: Add/remove ngrams from lists
......
......@@ -142,12 +142,16 @@
}
};
});
/*
* new NGram from the user input
*/
annotationsAppNgramList.controller('NgramInputController',
['$scope', '$rootScope', '$element', 'NgramListHttpService',
function ($scope, $rootScope, $element, NgramListHttpService) {
'MainApiChangeNgramHttpService', 'MainApiAddNgramHttpService',
function ($scope, $rootScope, $element, NgramListHttpService,
MainApiChangeNgramHttpService, MainApiAddNgramHttpService) {
/*
* Add a new NGram from the user input in the extra-text list
*/
......@@ -158,11 +162,14 @@
var value = angular.element(inputEltId).val().trim();
if (value === "") return;
// £TEST locally check if already in annotations NodeNgrams ------
// locally check if already in annotations NodeNgrams ------------
// $rootScope.annotations = array of ngram objects like:
// {"list_id":805,"occs":2,"uuid":9386,"text":"petit échantillon"}
// TODO £NEW : lookup obj[list_id][term_text] = {terminfo}
// // $rootScope.lookup =
console.log('looking for "' + value + '" in list:' + listId)
var already_in_list = false ;
angular.forEach($rootScope.annotations, function(annot,i) {
......@@ -177,49 +184,65 @@
if (already_in_list) { return ; }
// ---------------------------------------------------------------
// will check if there's a preexisting ngramId for this value
// TODO: reconnect separately from list addition
// TODO: if maplist => also add to miam
// NgramHttpService.post(
// {
// 'listId': listId,
// 'ngramId': 'create'
// },
// {
// 'text': value
// },
// function(data) {
// console.warn("refresh attempt");
// // on success
// if (data) {
// angular.element(inputEltId).val("");
// // Refresh the annotationss
// NgramListHttpService.get(
// {
// 'corpusId': $rootScope.corpusId,
// 'docId': $rootScope.docId
// },
// function(data) {
// $rootScope.annotations = data[$rootScope.corpusId.toString()][$rootScope.docId.toString()];
//
// // TODO £NEW : lookup obj[list_id][term_text] = {terminfo}
// // $rootScope.lookup =
//
//
// $rootScope.refreshDisplay();
// },
// function(data) {
// console.error("unable to get the list of ngrams");
// }
// );
// }
// }, function(data) {
// // on error
// angular.element(inputEltId).parent().addClass("has-error");
// console.error("error adding Ngram "+ value);
// }
// );
};
// AddNgram
// --------
// creation will return an ngramId
// (checks if there's a preexisting ngramId for this value
// otherwise creates a new one and indexes the ngram in corpus)
MainApiAddNgramHttpService.put(
{
// text <=> str to create the new ngram
'text': value,
'corpusId': $rootScope.corpusId
},
// on AddNgram success
function(data) {
var newNgramId = data.id
console.log("OK created new ngram for '"+value+"' with id: "+newNgramId)
// ChangeNgram
// -----------
// add to listId after creation
// TODO: if maplist => also add to miam
MainApiChangeNgramHttpService["put"](
{
'listId': listId,
'ngramIdList': newNgramId
},
// on ChangeNgram success
function(data) {
// Refresh the annotations (was broken: TODO FIX)
console.warn("refresh attempt");
angular.element(inputEltId).val(""); // reset the input field
NgramListHttpService.get(
{
'corpusId': $rootScope.corpusId,
'docId': $rootScope.docId
},
// on refresh success
function(data) {
$rootScope.annotations = data[$rootScope.corpusId.toString()][$rootScope.docId.toString()];
$rootScope.refreshDisplay();
},
// on refresh error
function(data) {
console.error("unable to get the list of ngrams");
}
);
},
// on ChangeNgram error
function(data) {
console.error("unable to edit the Ngram"+ngramId+") on list "+listId+")");
}
);
},
// on AddNgram error
function(data) {
angular.element(inputEltId).parent().addClass("has-error");
console.error("error adding Ngram "+ value);
}
);
}; // onListSubmit
}]);
})(window);
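The AddNgram → ChangeNgram chain implemented above can also be replayed directly against the HTTP API; a minimal sketch in Python with the requests library, where the base URL, the session cookie and the ids are assumptions, not part of this commit:

```python
import requests

BASE = "http://localhost:8000"               # assumed local instance
cookies = {"sessionid": "<session-cookie>"}  # assumed authenticated session

# step 1 - AddNgram: PUT api/ngrams?text=...&corpus=... returns the new ngram id
r1 = requests.put(BASE + "/api/ngrams",
                  params={"text": "petit échantillon", "corpus": 2},
                  cookies=cookies)
new_ngram_id = r1.json()["id"]

# step 2 - ChangeNgram: PUT api/ngramlists/change?list=LISTID&ngrams=ID1,ID2...
r2 = requests.put(BASE + "/api/ngramlists/change",
                  params={"list": 805, "ngrams": new_ngram_id},
                  cookies=cookies)
r2.raise_for_status()
```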
......@@ -121,7 +121,10 @@
<!-- this menu is over the text on mouse selection -->
<div ng-controller="TextSelectionMenuController" id="selection" class="selection-menu">
<ul class="noselection">
<li ng-repeat="item in menuItems" class="{[{item.tgtListName}]}" ng-click="onMenuClick($event, item.crudActions)">Move to {[{item.tgtListName}]}</li>
<li ng-repeat="item in menuItems"
class="{[{item.tgtListName}]}"
ng-click="onMenuClick($event, item.crudCalls)"
>{[{item.comment ? item.comment : 'Move to ' + item.tgtListName}]}</li>
</ul>
</div>
</div>
......
......@@ -18,5 +18,5 @@ urlpatterns = [
# 2016-03-24: refactoring, deactivated NgramEdit and NgramCreate
# 2016-05-27: removed NgramEdit: replaced the local httpservice by api/ngramlists
# url(r'^lists/(?P<list_id>[0-9]+)/ngrams/create$', views.NgramCreate.as_view()), #
# 2016-07-21: removed NgramCreate: replaced the local httpservice by api/ngrams (put)
]
......@@ -162,55 +162,7 @@ class NgramList(APIView):
# 2016-03-24: refactoring, deactivated NgramEdit and NgramCreate
# 2016-05-27: removed NgramEdit: replaced the local httpservice by api/ngramlists
# ------------------------------------
#
# class NgramCreate(APIView):
# """
# Create a new Ngram in one list
# """
# renderer_classes = (JSONRenderer,)
# authentication_classes = (SessionAuthentication, BasicAuthentication)
#
# def post(self, request, list_id):
# """
# create NGram in a given list
#
# example: request.data = {'text': 'phylogeny'}
# """
# # implicit global session
# list_id = int(list_id)
# # format the ngram's text
# ngram_text = request.data.get('text', None)
# if ngram_text is None:
# raise APIException("Could not create a new Ngram without one \
# text key in the json body")
#
# ngram_text = ngram_text.strip().lower()
# ngram_text = ' '.join(ngram_text.split())
# # check if the ngram exists with the same terms
# ngram = session.query(Ngram).filter(Ngram.terms == ngram_text).first()
# if ngram is None:
# ngram = Ngram(n=len(ngram_text.split()), terms=ngram_text)
# else:
# # make sure the n value is correct
# ngram.n = len(ngram_text.split())
#
# session.add(ngram)
# session.commit()
# ngram_id = ngram.id
# # create the new node_ngram relation
# # TODO check existing Node_Ngram ?
# # £TODO ici indexation
# node_ngram = NodeNgram(node_id=list_id, ngram_id=ngram_id, weight=1.0)
# session.add(node_ngram)
# session.commit()
#
# # return the response
# return Response({
# 'uuid': ngram_id,
# 'text': ngram_text,
# 'list_id': list_id,
# })
# 2016-07-21: removed NgramCreate: replaced the local httpservice by api/ngrams (put)
class Document(APIView):
"""
......
Life cycle of ngram counts
-----------------------------------
### (current scheme and possible directions) ###
In what produces the counts, we can distinguish two levels or steps:
1. the initial extraction and the storage of the weight of the ngram-document
relation (let's call these nodes "1doc")
2. everything else: the preparation of the aggregated counts for the terms
table ("stats"), and for the working tables of the graphs and of
publication search.
We could perhaps speak of per-doc indexing for level 1 and of "modeling" for level 2.
Note that level 1 deals with **forms**, i.e. bare ngrams (the observed form <=> a unique character string after normalization), whereas at level 2 we have richer objects... As processing moves along we still always have ngrams, but:
- filtered (we don't compute everything on everything)
- typed with the map, stop and main lists (and perhaps soon user
"ownlists")...
- grouped (what we see with the `+` in the terms table, and which we
could perhaps also surface on the graph side?)
We could say that at level 2 we are handling **terms** rather than **forms**... they are still ngrams, but enriched by their inclusion in a series of mini models (aggregations and a typology of ngrams driven by usage).
### Tables in the DB
If we adopt this distinction between forms and terms, it clarifies when the contents of the tables have to be updated. On the data-structure side, the counts are always stored as n-tuples, which can be summarized as follows:
- **1doc**: (doc:node - form:ngr - weight:float) in NodeNgram
tables
- **occs/gen/spec/tirank**: (measure_type:node - term:ngr -
weight:float) in NodeNgram tables
- **cooc**: (graph_type:node - term1:ngr - term2:ngr -
weight:float) in NodeNgramNgram tables
- **tfidf**: (doclink_type:node - doc:node - term:ngr -
correlation:float) in NodeNodeNgram tables.
Here "type" is the node carrying the nature of the obtained stat, or the
ref of the graph for cooc and of the index tied to publication search for
the tfidf.
There are also the relations that contain no counts but are essential
to build the counts of the others:
- map/main/stop list: (list_type:node - form or term:ngr) in
NodeNgram tables
- "groups": (mainform:ngr - subform:ngr) in NodeNgramNgram
tables.
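To make these n-tuples concrete, here is a minimal query sketch against two of them, assuming the SQLAlchemy session and models used elsewhere in this repo (`gargantext.util.db.session`, `gargantext.models`); the node ids are made up for illustration:

```python
from gargantext.util.db import session
from gargantext.models import Ngram, NodeNgram

# "1doc" level: (doc:node - form:ngr - weight) rows for one document
doc_id = 9          # hypothetical DOCUMENT node id
doc_counts = (session
    .query(Ngram.terms, NodeNgram.weight)
    .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
    .filter(NodeNgram.node_id == doc_id)
    .all())

# stats level (occs/gen/spec): same NodeNgram table, the node is the measure
occs_node_id = 805  # hypothetical node carrying an occurrence count
occs = (session
    .query(NodeNgram.ngram_id, NodeNgram.weight)
    .filter(NodeNgram.node_id == occs_node_id)
    .all())
```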
### Update scenarios
In the flow of the "user scenarios", several events come along to
**modify these counts**:
- (A) term creations by the user (e.g. by selection/addition in the
annotation view)
- (B) imports of terms whose forms were never indexed on this
corpus
- (C) term ungroupings performed by the user
- (D) moving a term from the stoplist to the other lists
- (E) any other list change and/or creation of new
groups...
A and B are the only two steps, apart from the initial extraction, where
forms are added. Currently A and B are handled right away for
level 1 (per-doc tables): it seems best to re-index
the 1doc relations as soon as possible after A or B. In the annotation
view, the user expects the highlighting to appear
immediately on the displayed doc. For import B it is convenient too, since
we have the list of new terms at hand, which avoids storing it
somewhere while waiting for a later recomputation.
The other information updated right away for A and B is membership
in lists and in groups (for B), which requires no computation.
C, D and E do not affect level 1 (per-doc tables) since they
add no new forms, but they do modify the
lists and the groups, and must therefore trigger an
update of the tfidf (which requires a recomputation) and of the
coocs on the map list (an effect applied when a new graph is requested).
C and D also require an update of the per-term stats
(occurrences, gen/spec etc.), since subform elements and
stoplist elements do not appear in the stats.
So, to summarize, in all cases:
=> addition to a list or a group, and any count of a
new form in the docs, are handled as soon as the user acts
=> but the more "advanced" modelings represented by the
occs, gen and spec stats and by the "coocs on map" and
"tfidf" working tables must wait for a recomputation.
Ideally, in the future, they would all be updated incrementally instead
of forcing this recomputation... but that is where things stand for now.
### Associated functions
| | GUI | API action → url | VIEW | SUBROUTINES |
|-------|-------------------------------------------------------|--------------------------------------------------------------------|-------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------|
| A | "annotations/highlight.js, annotations/ngramlists.js" | "PUT → api/ngrams, PUT/DEL → api/ngramlists/change" | "ApiNgrams, ListChange" | util.toolchain.ngrams_addition.index_new_ngrams |
| B | NGrams_dyna_chart_and_table | POST/PATCH → api/ngramlists/import | CSVLists | "util.ngramlists_tools.import_ngramlists, util.ngramlists_tools.merge_ngramlists, util.toolchain.ngrams_addition.index_new_ngrams" |
| C,D,E | NGrams_dyna_chart_and_table | "PUT/DEL → api/ngramlists/change, PUT/DEL → api/ngramlists/groups" | "ListChange, GroupChange" | util.toolchain.ngrams_addition.index_new_ngrams |
Import B was brought back into service a few weeks ago, and I have just
reconnected A in the annotation view.
......@@ -8,6 +8,12 @@ import random
_members = [
{ 'first_name' : 'Constance', 'last_name' : 'de Quatrebarbes',
'mail' : '4barbesATgmail.com',
'website' : 'http://c24b.github.io/',
'picture' : 'constance.jpg',
'role' : 'developer'},
{ 'first_name' : 'David', 'last_name' : 'Chavalarias',
'mail' : 'david.chavalariasATiscpif.fr',
'website' : 'http://chavalarias.com',
......
......@@ -48,28 +48,19 @@ def query_grouped_ngrams(groupings_id, details=False, scoring_metric_id=None):
Parameter:
- details: if False, send just the array of ngram_ids
if True, send triples with (ngram_id, term, scoring)
^^^^^^^
deprecated: scoring_metric_id: id of a scoring metric node (TFIDF or OCCS)
(for details and sorting)
(no more OCCS counts of subforms)
if True, send couples with (ngram_id, term)
"""
if not details:
# simple contents
query = session.query(NodeNgramNgram.ngram2_id)
else:
# detailed contents (terms and some NodeNodeNgram for score)
# detailed contents (id + terms)
query = (session
.query(
NodeNgramNgram.ngram2_id,
Ngram.terms,
# NodeNodeNgram.score #
)
.join(Ngram, NodeNgramNgram.ngram2_id == Ngram.id)
# .join(NodeNodeNgram, NodeNgramNgram.ngram2_id == NodeNodeNgram.ngram_id)
# .filter(NodeNodeNgram.node1_id == scoring_metric_id)
# .order_by(desc(NodeNodeNgram.score))
)
# main filter
......
......@@ -21,6 +21,9 @@ from gargantext.constants import DEFAULT_CSV_DELIM, DEFAULT_CSV_DELIM_GRO
from gargantext.util.toolchain.parsing import normalize_chars
from gargantext.util.toolchain.ngrams_extraction import normalize_forms
# merge will also index the new ngrams in the docs of the corpus
from gargantext.util.toolchain.ngrams_addition import index_new_ngrams
from sqlalchemy.sql import exists
from os import path
from csv import writer, reader, QUOTE_MINIMAL
......@@ -483,7 +486,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
this_row_forms = ''
# string normalizations
this_row_label = normalize_terms(normalize_chars(this_row_label))
this_row_label = normalize_forms(normalize_chars(this_row_label))
# except:
# if i == 0:
......@@ -521,7 +524,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
for raw_term_str in this_row_forms.split(group_delimiter):
# each subform is also like an ngram declaration
term_str = normalize_terms(normalize_chars(raw_term_str))
term_str = normalize_forms(normalize_chars(raw_term_str))
imported_unique_ngramstrs[term_str] = True
imported_nodes_ngrams[this_list_type].append(term_str)
......@@ -559,6 +562,7 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
# print(new_ngrams_ids)
# print(imported_nodes_ngrams)
# ======== Import into lists =========
# 3 x abstract lists + 1 translations
......@@ -632,11 +636,8 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
will be put back into the main list at the end)
NB: Uses group_tools.group_union() to merge the synonym links.
FIXME: new terms created at import_ngramlists() can now be added to lists
but are never added to docs
Uses ngrams_addition.index_new_ngrams() to also add new ngrams to the docs
"""
# log to send back to client-side (lines will be joined)
my_log = []
......@@ -656,6 +657,20 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
{'key': 'map', 'name':"MAPLIST"} # lid = 2
]
# ======== Index the new ngrams in the docs =========
all_possibly_new_ngram_ids = []
collect = all_possibly_new_ngram_ids.append
for lid, info in enumerate(linfos):
list_type = info['key']
if list_type in new_lists:
for ng_id in new_lists[list_type].items:
collect(ng_id)
n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)
# ======== Get the old lists =========
old_lists = {}
......
......@@ -60,7 +60,7 @@ class Parser:
print(error, 'Date not parsed for:', date_string)
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
elif hyperdata.get('publication_year', None) is not None:
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
for prefix in prefixes:
......@@ -113,7 +113,7 @@ class Parser:
hyperdata[prefix + "_hour"] = date.strftime("%H")
hyperdata[prefix + "_minute"] = date.strftime("%M")
hyperdata[prefix + "_second"] = date.strftime("%S")
print("line 116", hyperdata['publication_date'])
# print("line 116", hyperdata['publication_date'])
# finally, return the transformed result!
return hyperdata
......
......@@ -265,7 +265,8 @@ def recount(corpus):
# -> specclusion/genclusion: compute + write (=> NodeNodeNgram)
(spec_id, gen_id) = compute_specgen(corpus, cooc_matrix = coocs,
spec_overwrite_id = spec_id, gen_overwrite_id = gen_id)
spec_overwrite_id = old_spec_id,
gen_overwrite_id = old_gen_id)
print('RECOUNT #%d: [%s] updated spec-clusion node #%i' % (corpus.id, t(), spec_id))
print('RECOUNT #%d: [%s] updated gen-clusion node #%i' % (corpus.id, t(), gen_id))
......
"""
Module for raw indexing a totally new ngram
=> creates new (doc_node <-> new_ngram) relations in NodeNgram
use cases:
- from annotation view user selects a free segment of text to make a new ngram
- at list import, any new list can contain ngrams that've never been extracted
prerequisite:
- normalize_chars(new_ngram_str)
- normalize_forms(new_ngram_str)
- add the new ngram to `ngrams` table
procedure:
- simple regexp search of the ngram string => addition to NodeNgram
/!\ -> morphological variants are NOT considered (e.g. plural or declined forms)
"""
from gargantext.models import Ngram, Node, NodeNgram
from gargantext.util.db import session, bulk_insert
from sqlalchemy import distinct
from re import findall, IGNORECASE
# TODO from gargantext.constants import LIST_OF_KEYS_TO_INDEX = title, abstract
def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
"""
Find occurrences of some ngrams for every document of the given corpus.
+ insert them in the NodeNgram table.
@param ngram_ids: a list of ids for Ngram objects
(we assume they already went through normalization
and they were already added to Ngrams table
and optionally to some of the lists like MAPLIST)
(but we can't know if they were previously indexed in the corpus)
@param corpus: the CORPUS node
@param keys: the hyperdata fields to index
"""
# check the ngrams we won't process (those that were already indexed)
indexed_ngrams_subquery = (session
.query(distinct(NodeNgram.ngram_id))
.join(Node, Node.id == NodeNgram.node_id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == 'DOCUMENT')
.subquery()
)
# retrieve the ngrams from our list, filtering out the already indexed ones
todo_ngrams = (session
.query(Ngram)
.filter(Ngram.id.in_(ngram_ids))
.filter(~ Ngram.id.in_(indexed_ngrams_subquery))
.all()
)
# initialize result dict
node_ngram_to_write = {}
# loop through the docs and their text fields
for doc in corpus.children('DOCUMENT'):
# a new empty counting subdict
node_ngram_to_write[doc.id] = {}
for key in keys:
# a text field
text = doc.hyperdata.get(key, None)
if not isinstance(text, str):
# print("WARN: doc %i has no text in field %s" % (doc.id, key))
continue
for ngram in todo_ngrams:
# build regexp : "british" => r'\bbritish\b'
ngram_re = r'\b%s\b' % ngram.terms
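# NB (assumption): normalized terms contain no regex metacharacters;
# if they could, re.escape(ngram.terms) would be needed here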
# --------------------------------------- find ---
n_occs = len(findall(ngram_re, text, IGNORECASE))
# -----------------------------------------------
# save the count results
if n_occs > 0:
if ngram.id not in node_ngram_to_write[doc.id]:
node_ngram_to_write[doc.id][ngram.id] = n_occs
else:
node_ngram_to_write[doc.id][ngram.id] += n_occs
# integrate all at the end
my_new_rows = []
add_new_row = my_new_rows.append
for doc_id in node_ngram_to_write:
for ngram_id in node_ngram_to_write[doc_id]:
wei = node_ngram_to_write[doc_id][ngram_id]
add_new_row([doc_id, ngram_id, wei])
del node_ngram_to_write
bulk_insert(
table = NodeNgram,
fields = ('node_id', 'ngram_id', 'weight'),
data = my_new_rows
)
n_added = len(my_new_rows)
print("index_new_ngrams: added %i new NodeNgram rows" % n_added)
return n_added
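A quick way to exercise `index_new_ngrams()` interactively; a minimal sketch assuming a corpus node reachable through the db cache (as done elsewhere in this codebase), with made-up ids:

```python
from gargantext.util.db_cache import cache
from gargantext.util.toolchain.ngrams_addition import index_new_ngrams

corpus_node = cache.Node[2]                             # hypothetical CORPUS node id
n_added = index_new_ngrams([9386, 35183], corpus_node)  # hypothetical ngram ids
print("%i new (doc <-> ngram) rows" % n_added)
```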
......@@ -84,7 +84,7 @@ def normalize_chars(my_str):
(other, more invasive treatments, such as removing quotation marks
or lowercasing, should rather take place *after* the tagger,
cf. toolchain.ngrams_extraction.normalize_terms)
cf. toolchain.ngrams_extraction.normalize_forms)
"""
# print('normalize_chars IN: "%s"' % my_str)
# --------------
......
from gargantext.util.http import ValidationException, APIView \
, get_parameters, JsonHttpResponse, Http404\
, HttpResponse
......@@ -265,64 +263,6 @@ class NodeNgramsQueries(APIView):
return CsvHttpResponse(sorted(result.items()), ('date', 'value'), 201)
# ?? TODO put in an ngrams.py file separately ?
class ApiNgrams(APIView):
def get(self, request):
# parameters retrieval and validation
startwith = request.GET.get('startwith', '').replace("'", "\\'")
# query ngrams
ParentNode = aliased(Node)
ngrams_query = (session
.query(Ngram.id, Ngram.terms, func.sum(NodeNgram.weight).label('count'))
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, Node.id == NodeNgram.node_id)
.group_by(Ngram.id, Ngram.terms)
# .group_by(Ngram)
.order_by(func.sum(NodeNgram.weight).desc(), Ngram.terms)
)
# filters
if 'startwith' in request.GET:
ngrams_query = ngrams_query.filter(Ngram.terms.startswith(request.GET['startwith']))
if 'contain' in request.GET:
print("request.GET['contain']")
print(request.GET['contain'])
ngrams_query = ngrams_query.filter(Ngram.terms.contains(request.GET['contain']))
if 'corpus_id' in request.GET:
corpus_id_list = list(map(int, request.GET.get('corpus_id', '').split(',')))
if corpus_id_list and corpus_id_list[0]:
ngrams_query = ngrams_query.filter(Node.parent_id.in_(corpus_id_list))
if 'ngram_id' in request.GET:
ngram_id_list = list(map(int, request.GET.get('ngram_id', '').split(',')))
if ngram_id_list and ngram_id_list[0]:
ngrams_query = ngrams_query.filter(Ngram.id.in_(ngram_id_list))
# pagination
offset = int(request.GET.get('offset', 0))
limit = int(request.GET.get('limit', 20))
total = ngrams_query.count()
# return formatted result
return JsonHttpResponse({
'pagination': {
'offset': offset,
'limit': limit,
'total': total,
},
'data': [
{
'id': ngram.id,
'terms': ngram.terms,
'count': ngram.count,
}
for ngram in ngrams_query[offset : offset+limit]
],
})
_operators_dict = {
"=": lambda field, value: (field == value),
"!=": lambda field, value: (field != value),
......
......@@ -614,8 +614,7 @@ class ListFamily(APIView):
mainlist_query = query_list(mainlist_id, details=True,
scoring_metric_id= scores_id)
# infos for grouped ngrams, absent from mainlist
hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True,
scoring_metric_id= scores_id)
hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)
# infos for stoplist terms, absent from mainlist
stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
......
from gargantext.util.http import ValidationException, APIView \
, get_parameters, JsonHttpResponse\
, HttpResponse
from gargantext.util.db import session, func
from gargantext.util.db_cache import cache
from gargantext.models import Node, Ngram, NodeNgram
from sqlalchemy.orm import aliased
from re import findall
# ngrams put() will implement same text cleaning procedures as toolchain
from gargantext.util.toolchain.parsing import normalize_chars
from gargantext.util.toolchain.ngrams_extraction import normalize_forms
# for indexing
from gargantext.util.toolchain.ngrams_addition import index_new_ngrams
class ApiNgrams(APIView):
def get(self, request):
"""
Used for analytics
------------------
Get ngram listing + counts in a given scope
"""
# parameters retrieval and validation
startwith = request.GET.get('startwith', '').replace("'", "\\'")
# query ngrams
ParentNode = aliased(Node)
ngrams_query = (session
.query(Ngram.id, Ngram.terms, func.sum(NodeNgram.weight).label('count'))
.join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
.join(Node, Node.id == NodeNgram.node_id)
.group_by(Ngram.id, Ngram.terms)
# .group_by(Ngram)
.order_by(func.sum(NodeNgram.weight).desc(), Ngram.terms)
)
# filters
if 'startwith' in request.GET:
ngrams_query = ngrams_query.filter(Ngram.terms.startswith(request.GET['startwith']))
if 'contain' in request.GET:
print("request.GET['contain']")
print(request.GET['contain'])
ngrams_query = ngrams_query.filter(Ngram.terms.contains(request.GET['contain']))
if 'corpus_id' in request.GET:
corpus_id_list = list(map(int, request.GET.get('corpus_id', '').split(',')))
if corpus_id_list and corpus_id_list[0]:
ngrams_query = ngrams_query.filter(Node.parent_id.in_(corpus_id_list))
if 'ngram_id' in request.GET:
ngram_id_list = list(map(int, request.GET.get('ngram_id', '').split(',')))
if ngram_id_list and ngram_id_list[0]:
ngrams_query = ngrams_query.filter(Ngram.id.in_(ngram_id_list))
# pagination
offset = int(request.GET.get('offset', 0))
limit = int(request.GET.get('limit', 20))
total = ngrams_query.count()
# return formatted result
return JsonHttpResponse({
'pagination': {
'offset': offset,
'limit': limit,
'total': total,
},
'data': [
{
'id': ngram.id,
'terms': ngram.terms,
'count': ngram.count,
}
for ngram in ngrams_query[offset : offset+limit]
],
})
def put(self, request):
"""
Basic external access for *creating an ngram*
---------------------------------------------
1 - checks user authentication before any changes
2 - adds the ngram to Ngram table in DB
3 - (if corpus param is present)
adds the ngram doc counts to NodeNgram table in DB
(aka "index the ngram" through the docs of the corpus)
4 - returns json with:
'msg' => a success msg
'text' => the initial text content
'term' => the normalized text content
'id' => the new ngram_id
'count' => the number of docs with the ngram in the corpus
(if corpus param is present)
possible inline parameters
--------------------------
@param text=<ngram_string> [required]
@param corpus=<CORPUS_ID> [optional]
"""
# 1 - check user authentication
if not request.user.is_authenticated():
res = HttpResponse("Unauthorized")
res.status_code = 401
return res
# the params
params = get_parameters(request)
print("PARAMS", [(i,v) for (i,v) in params.items()])
if 'text' in params:
original_text = str(params.pop('text'))
ngram_str = normalize_forms(normalize_chars(original_text))
else:
raise ValidationException('The route PUT /api/ngrams/ is used to create a new ngram. '
                          'It requires a "text" parameter, '
                          'for instance /api/ngrams?text=hydrometallurgy')
# if we have a 'corpus' param (to do the indexing)...
do_indexation = False
if 'corpus' in params:
# we retrieve the corpus...
corpus_id = int(params.pop('corpus'))
corpus_node = cache.Node[corpus_id]
# and the user must also have rights on the corpus
if request.user.id == corpus_node.user_id:
do_indexation = True
else:
res = HttpResponse("Unauthorized")
res.status_code = 401
return res
# number of "words" in the ngram
ngram_size = len(findall(r' +', ngram_str)) + 1
# do the additions
try:
log_msg = ""
ngram_id = None
preexisting = session.query(Ngram).filter(Ngram.terms==ngram_str).first()
if preexisting is not None:
ngram_id = preexisting.id
log_msg += "ngram already existed (id %i)\n" % ngram_id
else:
# 2 - insert into Ngrams
new_ngram = Ngram(terms=ngram_str, n=ngram_size)
session.add(new_ngram)
session.commit()
ngram_id = new_ngram.id
log_msg += "ngram was added with new id %i\n" % ngram_id
# 3 - index the term
if do_indexation:
n_added = index_new_ngrams([ngram_id], corpus_node)
log_msg += 'ngram indexed in corpus %i\n' % corpus_id
return JsonHttpResponse({
'msg': log_msg,
'text': original_text,
'term': ngram_str,
'id' : ngram_id,
'count': n_added if do_indexation else 'no corpus provided for indexation'
}, 200)
# just in case
except Exception as e:
return JsonHttpResponse({
'msg': str(e),
'text': original_text
}, 400)
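For reference, here is how this view can be exercised from Python; a minimal sketch with the requests library, assuming a local instance and an authenticated Django session (both assumptions, as are the ids):

```python
import requests

BASE = "http://localhost:8000"               # assumed local dev server
cookies = {"sessionid": "<session-cookie>"}  # assumed logged-in user

# PUT: create (or retrieve) an ngram and index it in corpus 2
r = requests.put(BASE + "/api/ngrams",
                 params={"text": "hydrometallurgy", "corpus": 2},
                 cookies=cookies)
print(r.json())  # {'msg': ..., 'term': 'hydrometallurgy', 'id': ..., 'count': ...}

# GET: the analytics listing of the same view, with filters
r = requests.get(BASE + "/api/ngrams",
                 params={"corpus_id": 2, "startwith": "hydro", "limit": 5},
                 cookies=cookies)
print(r.json()["data"])
```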
......@@ -215,6 +215,7 @@ class NodeListHaving(APIView):
class NodeResource(APIView):
# TODO either real authentification test or remove check on user.id
def get(self, request, node_id):
parameters, query, count = _query_nodes(request, node_id)
if not len(query):
......
from django.conf.urls import url
from . import nodes
from . import ngrams
from . import metrics
from . import ngramlists
from . import analytics
......@@ -10,9 +11,11 @@ urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view()
, url(r'^nodes/(\d+)$' , nodes.NodeResource.as_view() )
, url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view() )
# Ngrams
, url(r'^ngrams/?$' , ngrams.ApiNgrams.as_view() )
# Analytics
, url(r'^nodes/(\d+)/histories$', analytics.NodeNgramsQueries.as_view())
, url(r'^ngrams/$' , analytics.ApiNgrams.as_view() )
, url(r'hyperdata$' , analytics.ApiHyperdata.as_view() )
# get a list of ngram_ids or ngram_infos by list_id
# url(r'^ngramlists/(\d+)$', ngramlists.List.as_view()),
......
#!/bin/bash
#name:01-setup
echo :"**************CLONE**************************"
echo -n "Enter and press [ENTER]: "
read name
#ssh-agent bash -c 'ssh-add /home/christoffer/ssh_keys/theuser; git clone
#TODO clone the repo into /srv/gargantext/ and reduce the different steps
#git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
# && cd /srv/gargantext \
# && git fetch origin stable \
# && git checkout stable \
#echo "****************SETUP**************************";
#for dir in "/srv/gargantext_lib" "/srv/gargantext_static" "/srv/gargantext_media"; do
# sudo mkdir -p $dir ;
# sudo chown gargantua:gargantua $dir ;
#done;
#here gargantext_lib.tar.bz2 inside this directory
#sudo wget http://dl.gargantext.org/gargantext_lib.tar.bz2 && \
sudo tar xvjf gargantext_lib.tar.bz2 -C /srv/gargantext_lib \
&& sudo chown -R gargantua:gargantua /srv/gargantext_lib \
&& echo ":::::::::::::::::Done::::::::::::::::::::::::::";
#!/bin/bash
#configure the base image gargamelle
#echo '****************BUILD**********************************'
#echo ':::: BUILD ::::'
sudo docker build -t gargamelle:latest ./gargamelle
#sudo docker build -t gargamelle:latest ./gargamelle
#2 option with this image:
# configure the container
# run the image with the app in it
echo '::::::::::::::::::::GARGAMELLE IMAGE BUILT:::::::::::::'
echo '*************CONFIG************************************'
echo ':::: CONFIGURE ::::'
sudo docker run \
-v /srv/:/srv/ \
-p 8000 \
-p 8000:8000 \
-p 5432 \
-it gargamelle:latest \
/bin/bash -c "/srv/gargantext/install/gargamelle/psql_configure.sh && /srv/gargantext/install/gargamelle/django_configure.sh && exit;"
/bin/bash -c "./psql_configure.sh; ./django_configure.sh ; exit"
sudo docker rm -f `docker ps -a | grep -v CONTAINER | awk '{print $1 }'`
#~ sudo docker run \
#~ -v /srv/:/srv/ \
#~ -p 8000 \
#~ -p 5432 \
#~ -it gargamelle:latest \
#~ /bin/bash -c "/srv/gargantext/install/gargamelle/django_configure.sh; exit;"
#sudo docker rm -f `docker ps -a | grep -v CONTAINER awk '{print $1 }'`
#!/bin/bash
sudo docker run \
-v /srv/:/srv/\
-p 8000 \
-p 8000:8000 \
-p 5432 \
-it gargamelle:latest \
# /bin/bash -c "service postgresql start; su gargantua -c \'source /env_3-5/bin/activate && /srv/gargantext/manage.py runserver 0.0.0.0:8000\'"
/bin/bash -c "service postgresql start; /bin/su gargantua -c 'source /env_3-5/bin/activate && /srv/gargantext/manage.py runserver 0.0.0.0:8000'"
sudo docker rm -f `docker ps -a | grep -v CONTAINER | awk '{print $1 }'`
# Install
Gargamelle is a development environment to install, configure and run the Gargantext platform.
## Prerequisites
* Mac or Debian-based OS
* Git
* Docker engine
## Installation Procedure
* Clone the repository
If you want to contribute, see our [git workflow procedure](../tools/git.md):
clone the repo and switch to your own branch, named username-branchname (for example username-unstable).
Inside the repository, the install directory contains the setup, config and run procedure files:
```
cd gargantext/install/
./01-setup
./02-config   # here you will have to choose a username & a password for gargantext
./03-run
```
Open your browser (preferably Chrome) at localhost:8000,
press Enter and log in.
Enjoy ;)!
......@@ -5,11 +5,6 @@
# wich contains all the source code of the app
FROM debian:stretch
MAINTAINER ISCPIF <gargantext@iscpif.fr>
# Configure global ENV with deb dependencies
# Configure local ENV requirements
########################################################################
ENV DEBIAN_FRONTEND noninteractive
USER root
......@@ -84,4 +79,7 @@ RUN echo "host all all 0.0.0.0/0 md5" >> /etc/postgresql/9.5/main/pg_hba.co
RUN echo "listen_addresses='*'" >> /etc/postgresql/9.5/main/postgresql.conf
EXPOSE 5432 8000
VOLUME ["/srv/",]
......@@ -8,17 +8,21 @@
##################################################
#configure django migrations
##################################################
echo "::::: DJANGO :::::"
#echo "Starting Postgres"
#/usr/sbin/service postgresql start
/bin/su gargantua -c 'source /env_3-5/bin/activate \
&& ./srv/gargantext/manage.py makemigrations \
&& ./srv/gargantext/manage.py migrate \
&& ./srv/gargantext/dbmigrate.py \
&& ./srv/gargantext/dbmigrate.py \
&& ./srv/gargantext/dbmigrate.py;'
/bin/su gargantua -c 'source /env_3-5/bin/activate &&\
echo "Activated env" &&\
./srv/gargantext/manage.py makemigrations &&\
./srv/gargantext/manage.py migrate && \
echo "migrations ok" &&\
./srv/gargantext/dbmigrate.py && \
./srv/gargantext/dbmigrate.py && \
./srv/gargantext/dbmigrate.py && \
./srv/gargantext/manage.py createsuperuser'
/usr/sbin/service postgresql stop
......@@ -8,26 +8,17 @@
## |_| \___/|___/\__\__, |_| \___||___/
## |___/
#######################################################################
service postgresql stop
su postgres -c 'pg_dropcluster 9.5 main --stop'
echo "::::: POSTGRESQL :::::"
su postgres -c 'pg_dropcluster 9.4 main --stop'
#done in docker but redoing it
if [[ -e "/srv/gargandata" ]]; then
rm -rf /srv/gargandata/*
else
mkdir /srv/gargandata;
chown -R postgres:postgres /srv/gargandata
fi
rm -rf /srv/gargandata && mkdir /srv/gargandata && chown postgres:postgres /srv/gargandata
su postgres -c '/usr/lib/postgresql/9.5/bin/initdb -D /srv/gargandata/'
#su postgres -c '/usr/lib/postgresql/9.5/bin/pg_ctl -D /srv/gargandata/ -l journal_applicatif start'
su postgres -c '/usr/lib/postgresql/9.5/bin/pg_ctl -D /srv/gargandata/ -l journal_applicatif start'
#su postgres -c 'pg_createcluster -D /srv/gargandata 9.5 main '
#su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.5 main start '
su postgres -c 'pg_createcluster -D /srv/gargandata 9.5 main '
su postgres -c 'pg_ctlcluster -D /srv/gargandata 9.5 main start '
su postgres -c 'pg_ctlcluster 9.5 main start'
service postgresql start
......
......@@ -32,5 +32,3 @@ lxml==3.5.0
requests-futures==0.9.7
bs4==0.0.1
requests==2.10.0
#testing github
#-e git://github.com/zzzeek/sqlalchemy.git@rel_1_1
......@@ -39,15 +39,16 @@
<div class="container">
<ul>
<li>
Version 3.0.0
Versions 3.*
<ul>
<li>[NAME] Blue Jasmin</li>
<li>[CODE] Refactored</li>
<li>[DATABASE] New schema</li>
<li>[INSTALL] Easy Docker</li>
</ul>
</li>
<li>
Version 2.0.0
Versions 2.*
<ul>
<li>[NAME] Red Lemon</li>
<li>[NLP] Turbo Parser, MELT</li>
......
......@@ -229,7 +229,7 @@
<p>
Gargantext
<span class="glyphicon glyphicon-registration-mark" aria-hidden="true"></span>
, version 3.0.4.2.1,
, version 3.0.4.3,
<a href="http://www.cnrs.fr" target="blank" title="Institution that enables this project.">
Copyrights
<span class="glyphicon glyphicon-copyright-mark" aria-hidden="true"></span>
......