Commit 6b1157d3 authored by delanoe

Merge branch 'testing' into stable-merge

parents 44cf24bf 01ac34f9
......@@ -151,12 +151,15 @@
transition: all 0.25s linear;
}
.selection {
/* this was used for the p or div that *contained* a selection */
/*.selection {
color: #aaa;
}
}*/
/* this is used for the selected text itself */
::selection {
color: black;
background-color: rgba(0, 0, 0, 0.4);
background-color: #aaa;
}
.noselection {
......
......@@ -97,6 +97,21 @@
// +propToRead+" ("+cache[propToRead]+")")
params[key] = cache[propToRead]
}
else if (typeof val == "object" && val["fromCacheIfElse"]) {
var propToReadIf = val["fromCacheIfElse"][0]
var propToReadElse = val["fromCacheIfElse"][1]
// console.log("reading from cache: response data property " +
// "if:"+propToReadIf+" ("+cache[propToReadIf]+")"+
// " else:"+propToReadElse+" ("+cache[propToReadElse]+")")
var valueIf = cache[propToReadIf]
var valueElse = cache[propToReadElse]
if (valueIf && valueIf != 'null' && valueIf != '') {
params[key] = valueIf
}
else {
params[key] = valueElse
}
}
}
// Now we run the call
......@@ -149,8 +164,8 @@
// -------------------------------------------------------------------------
// debug
// console.log("==> $rootScope <==")
// console.log($rootScope)
console.log("==> $rootScope <==")
console.log($rootScope)
});
})(window);
......@@ -15,7 +15,7 @@
{'docId': $rootScope.docId},
function(data, responseHeaders) {
$scope.authors = data.authors;
$scope.journal = data.journal;
$scope.source = data.source;
$scope.publication_date = data.publication_date;
//$scope.current_page_number = data.current_page_number;
//$scope.last_page_number = data.last_page_number;
......@@ -23,25 +23,34 @@
$rootScope.docId = data.id;
$rootScope.full_text = data.full_text;
$rootScope.abstract_text = data.abstract_text;
console.log("annotations.document.DocController.getannotations")
// GET the annotations
NgramListHttpService.get(
{
'corpusId': $rootScope.corpusId,
'docId': $rootScope.docId
},
function(data) {
$rootScope.annotations = data[$rootScope.corpusId.toString()][$rootScope.docId.toString()];
// eg id => 'MAPLIST'
$rootScope.lists = data[$rootScope.corpusId.toString()].lists;
// inverted 'MAPLIST' => id
$rootScope.listIds = _.invert($rootScope.lists)
$scope.dataLoading = false ;
},
function(data) {
console.error("unable to get the list of ngrams");
}
);
$rootScope.workflow_finished = data.corpus_status['complete'] ;
console.log("workflow status", $rootScope.workflow_finished)
if ($scope.workflow_finished) {
console.log("annotations.document.DocController.getannotations")
// GET the annotations
NgramListHttpService.get(
{
'corpusId': $rootScope.corpusId,
'docId': $rootScope.docId
},
function(data) {
$rootScope.annotations = data[$rootScope.corpusId.toString()][$rootScope.docId.toString()];
// eg id => 'MAPLIST'
$rootScope.lists = data[$rootScope.corpusId.toString()].lists;
// inverted 'MAPLIST' => id
$rootScope.listIds = _.invert($rootScope.lists)
$scope.dataLoading = false ;
},
function(data) {
console.error("unable to get the list of ngrams");
}
);
}
else {
$scope.dataLoading = false ;
}
});
......
......@@ -90,17 +90,20 @@
* MainApiAddNgramHttpService: Create and index a new ngram
* ===========================
* route: PUT api/ngrams?text=mynewngramstring&corpus=corpus_id
* ------
*
* NB it also checks if ngram exists (returns the preexisting id)
* and if it has a mainform/group (via 'testgroup' option)
* (useful if we add it to a list afterwards)
*
*/
http.factory('MainApiAddNgramHttpService', function($resource) {
return $resource(
// adding explicit "http://" b/c this is a cross-origin request
'http://' + window.GARG_ROOT_URL
+ "/api/ngrams?text=:ngramStr&corpus=:corpusId",
+ "/api/ngrams?text=:ngramStr&corpus=:corpusId&testgroup",
{
ngramStr: '@ngramStr',
corpusId: '@corpusId'
corpusId: '@corpusId',
},
{
put: {
......
......@@ -141,9 +141,9 @@
crudCallsToMake = [
{'service': MainApiAddNgramHttpService, 'action': 'put',
'params' : {'ngramStr':value, corpusId: $rootScope.corpusId},
'dataPropertiesToCache': ['id'] },
'dataPropertiesToCache': ['id', 'group'] },
{'service': MainApiChangeNgramHttpService, 'action': 'put',
'params' : {'listId':tgtListId, 'ngramIdList': {'fromCache': 'id'} } }
'params' : {'listId':tgtListId, 'ngramIdList': {'fromCacheIfElse': ['group','id']} } }
];
break;
......@@ -151,9 +151,9 @@
crudCallsToMake = [
{'service': MainApiAddNgramHttpService, 'action': 'put',
'params' : {'ngramStr':value, corpusId: $rootScope.corpusId},
'dataPropertiesToCache': ['id'] },
'dataPropertiesToCache': ['id', 'group'] },
{'service': MainApiChangeNgramHttpService, 'action': 'put',
'params' : {'listId':tgtListId, 'ngramIdList': {'fromCache': 'id'} } }
'params' : {'listId':tgtListId, 'ngramIdList': {'fromCacheIfElse': ['group','id']} } }
];
break;
......@@ -161,11 +161,11 @@
crudCallsToMake = [
{'service': MainApiAddNgramHttpService, 'action': 'put',
'params' : {'ngramStr':value, corpusId: $rootScope.corpusId},
'dataPropertiesToCache': ['id'] },
'dataPropertiesToCache': ['id', 'group'] },
{'service': MainApiChangeNgramHttpService, 'action': 'put',
'params' : {'listId':$rootScope.listIds.MAINLIST, 'ngramIdList': {'fromCache': 'id'} } },
'params' : {'listId':$rootScope.listIds.MAINLIST, 'ngramIdList': {'fromCacheIfElse': ['group','id']} } },
{'service': MainApiChangeNgramHttpService, 'action': 'put',
'params' : {'listId':tgtListId, 'ngramIdList': {'fromCache': 'id'} } }
'params' : {'listId':tgtListId, 'ngramIdList': {'fromCacheIfElse': ['group','id']} } }
];
break;
}
......
......@@ -24,7 +24,7 @@
<div id="annotationsApp" ng-cloak>
<div class="container-fluid">
<div class="row-fluid main-panel" ng-controller="NGramHighlightController">
<div class="col-md-4 col-xs-4 tabbable words-panel">
<div ng-if="workflow_finished" class="col-md-4 col-xs-4 tabbable words-panel">
<div class="list-selector">
<h5>Select highlighted list(s)
<select class="selectpicker" multiple ng-change="activeListsChange()" ng-model="lists" ng-controller="ActiveListsController">
......@@ -89,7 +89,7 @@
</div>
<div class="row-fluid">
<ul class="list-group clearfix">
<li class="list-group-item small"><span class="badge">journal</span>{[{journal}]}</li>
<li class="list-group-item small"><span class="badge">source</span>{[{source}]}</li>
<li class="list-group-item small"><span class="badge">authors</span>{[{authors}]}</li>
<li class="list-group-item small"><span class="badge">date</span>{[{publication_date}]}</li>
</ul>
......@@ -108,12 +108,14 @@
<span class="badge">abstract</span>
</div>
<p id="abstract-text" class="text-container">
{[{abstract_text}]}
<div ng-if="abstract_text == null" class="alert alert-info small" role="alert">Empty abstract text</div>
</p>
<div ng-if="full_text != null">
<span class="badge">full article</span>
</div>
<p id="full-text" class="text-container">
{[{full_text}]}
<div ng-if="full_text == null" class="alert alert-info small" role="alert">Empty full text</div>
</p>
</div>
......
......@@ -172,8 +172,9 @@ class Document(APIView):
def get(self, request, doc_id):
"""Document by ID"""
# implicit global session
node = session.query(Node).filter(Node.id == doc_id).first()
if node is None:
    raise APIException('This node does not exist', 404)
corpus = session.query(Node).filter(Node.id == node.parent_id).first()
corpus_workflow_status = corpus.hyperdata['statuses'][0]
......@@ -185,9 +186,10 @@ class Document(APIView):
pub_date = node.hyperdata.get('publication_date')
data = {
'corpus_status': corpus_workflow_status,
'title': node.hyperdata.get('title'),
'authors': node.hyperdata.get('authors'),
'journal': node.hyperdata.get('journal'),
'source': node.hyperdata.get('source'),
'publication_date': pub_date,
'full_text': node.hyperdata.get('full_text'),
'abstract_text': node.hyperdata.get('abstract'),
......
# Gargantext
Welcome to the Gargantext documentation!
List of garg's own JSON API URLs
===================================
2016-05-27
### /api/nodes/2
```
{
"id": 2,
"parent_id": 1,
"name": "abstract:\"evaporation+loss\"",
"typename": "CORPUS"
}
```
------------------------------
### /api/nodes?pagination_limit=-1
```
{
"records": [
{
"id": 9,
"parent_id": 2,
"name": "A recording evaporimeter",
"typename": "DOCUMENT"
},
(...)
{
"id": 119,
"parent_id": 81,
"name": "GRAPH EXPLORER COOC (in:81)",
"typename": "COOCCURRENCES"
}
],
"count": 119,
"parameters": {
"formated": "json","pagination_limit": -1,
"fields": ["id","parent_id","name","typename"],
"pagination_offset": 0
}
}
```
------------------------------
### /api/nodes?types[]=CORPUS
```
{
"records": [
{
"id": 2,
"parent_id": 1,
"name": "abstract:\"evaporation+loss\"",
"typename": "CORPUS"
},
(...)
{
"id": 8181,
"parent_id": 1,
"name": "abstract:(astrogeology+OR ((space OR spatial) AND planetary) AND geology)",
"typename": "CORPUS"
}
],
"count": 2,
"parameters": {
"pagination_limit": 10,
"types": ["CORPUS"],
"formated": "json",
"pagination_offset": 0,
"fields": ["id","parent_id","name","typename"]
}
}
```
------------------------------
### /api/nodes/5?fields[]=ngrams
<5> represents a doc_id or a list_id
```
{
"ngrams": [
[1.0,{"id":2299,"n":1,"terms":designs}],
[1.0,{"id":1917,"n":1,"terms":height}],
[1.0,{"id":1755,"n":2,"terms":higher speeds}],
[1.0,{"id":1940,"n":1,"terms":cylinders}],
[1.0,{"id":2221,"n":3,"terms":other synthesized materials}],
(...)
[2.0,{"id":1970,"n":1,"terms":storms}],
[9.0,{"id":1754,"n":2,"terms":spherical gauges}],
[1.0,{"id":1895,"n":1,"terms":direction}],
[1.0,{"id":2032,"n":1,"terms":testing}],
[1.0,{"id":1981,"n":2,"terms":"wind effects"}]
]
}
```
------------------------------
### /api/nodes/3?fields[]=id&fields[]=hyperdata&fields[]=typename
```
{
"id": 3,
"typename": "DOCUMENT",
"hyperdata": {
"language_name": "English",
"language_iso3": "eng",
"language_iso2": "en",
"title": "A blabla analysis of laser treated aluminium blablabla",
"name": "A blabla analysis of laser treated aluminium blablabla",
"authors": "A K. Jain, V.N. Kulkarni, D.K. Sood"
"authorsRAW": [
{"name": "....", "affiliations": ["... Research Centre,.. 085, Country"]},
{"name": "....", "affiliations": ["... Research Centre,.. 086, Country"]}
(...)
],
"abstract": "Laser processing of materials, being a rapid melt quenching process, quite often produces a surface which is far from being ideally smooth for ion beam analysis. (...)",
"genre": ["research-article"],
"doi": "10.1016/0029-554X(81)90998-8",
"journal": "Nuclear Instruments and Methods In Physics Research",
"publication_year": "1981",
"publication_date": "1981-01-01 00:00:00",
"publication_month": "01",
"publication_day": "01",
"publication_hour": "00",
"publication_minute": "00",
"publication_second": "00",
"id": "61076EB1178A97939B1C893904C77FB7DA2276D0",
"source": "elsevier",
"distributor": "istex"
}
}
```
## TODO continue the list
// dot ngram_parsing_flow.dot -Tpng -o ngram_parsing_flow.png
digraph ngramflow {
edge [fontsize=10] ;
label=<<B><U>gargantext.util.toolchain</U></B><BR/>(ngram extraction flow)>;
labelloc="t" ;
"extracted_ngrams" -> "grouplist" ;
"extracted_ngrams" -> "occs+ti_rank" ;
"project stoplist (todo)" -> "stoplist" ;
"stoplist" -> "mainlist" ;
"occs+ti_rank" -> "mainlist" [label=" TI_RANK_LIMIT"];
"mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
"coocs" -> "specificity" ;
"specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
"mainlist" -> "tfidf" ;
"tfidf" -> "explore" [label="doc relations with all map and candidates"];
"maplist" -> "explore" ;
"grouplist" -> "occs+ti_rank" ;
"grouplist" -> "coocs" ;
"grouplist" -> "tfidf" ;
}
# Contribution guide
## Community
* [http://gargantext.org/about](http://gargantext.org/about)
* IRC Chat: (OFTC/FreeNode) #gargantex
## Tools
* gogs
* server access
* forge
* gargantext box
## Gargantex
* Gargantex box install
(S.I.R. = Setup, Install & Run procedures)
* Architecture Overview
* Database Schema Overview
* Interface design Overview
## To do:
* Docs
* Interface design
* Parsers/scrapers
* Computing
## How to contribute:
1. Clone the repo
2. Create a new branch <username>-refactoring
3. Run the gargantext-box
4. Code
5. Test
6. Commit
### Example 1: Adding a parser
* create your new file cern.py in gargantext/scrapers/
* reference it in gargantext/scrapers/urls.py by
adding this line:
import scrapers.cern as cern
* reference it in gargantext/constants
```
# type 9
{ 'name': 'Cern',
'parser': CernParser,
'default_language': 'en',
},
```
* add an APIKEY in gargantext/settings
### Example 2: User Interface Design
# Contribution guide
* A question or a problem? Ask the community
* Sources
* Tools
* Contribution workflow: for contributions, bugs and features
* Some examples of contributions
## Community
Need help? Ask the community
* [http://gargantext.org/about](http://gargantext.org/about)
* IRC Chat: (OFTC/FreeNode) #gargantex
## Sources
Sources are available through the XXX LICENSE.
You can install Gargantext through the [installation procedure](./install.md).
## Tools
* gogs
* forge.iscpif.fr
* server access
* gargantext box
## Contributing: workflow procedure
Once you have installed and tested Gargantext:
1. Clone the stable release into your project
Note: the current stable release <release_branch> is: refactoring
Inside the repo, check out the reference branch and get the latest changes:
    git checkout <ref_branch>
    git pull
It is highly recommended to create a generic branch on a stable release, such as:
    git checkout -b <username>-<release_branch>
    git pull
2. Create your project on the stable release
    git checkout -b <username>-<release_branch>-<project_name>
Make your modifications and commits as you wish:
    git commit -m "foo/bar/1"
    git commit -m "foo/bar/2"
    git push
If you want to save your local changes, you can merge them into your generic branch <username>-<release_branch>:
    git checkout <username>-<release_branch>
    git pull
    git merge <username>-<release_branch>-<project_name>
    git commit -m "[Merge OK] comment"
## Technical Overview
* Interface Overview
* Database Schema Overview
* Architecture Overview
### Example 1: Adding a parser
### Example 2: User Interface Design
Life cycle of ngram counts
-----------------------------------
### (current design and future directions) ###
In what creates the counts, we can distinguish two levels or stages:
1. the initial extraction and the storage of the weight of the
ngram-document relation (let's call these nodes "1doc")
2. everything else: the preparation of the aggregated counts for the
terms table ("stats"), and for the working tables of the graphs and of
the publication search.
We could perhaps speak of per-doc indexing for level 1 and of "modelings" for level 2.
Note that level 1 deals with **forms**, i.e. bare ngrams (the observed form <=> a unique character string after normalization), whereas at level 2 we have richer objects... As processing goes on we still always have ngrams, but:
- filtered (we don't compute everything on everything)
- typed via the map, stop and main lists (and perhaps soon user
"own lists")...
- grouped (what we see with the `+` in the terms table, and which we
could perhaps also surface on the graph side?)
At level 2 we can thus say that we handle **terms** rather than **forms**... they are still ngrams, but enriched by their inclusion in a series of mini models (aggregations and a usage-driven typology of ngrams).
### Tables in the DB
If we adopt this distinction between forms and terms, it clarifies at what point we must update what we have in the tables. On the data-structure side, the counts are always stored as n-tuples, which can be summarized as follows:
- **1doc**: (doc:node - form:ngr - weight:float) in NodeNgram
tables
- **occs/gen/spec/tirank**: (measure_type:node - term:ngr -
weight:float) in NodeNgram tables
- **cooc**: (graph_type:node - term1:ngr - term2:ngr -
weight:float) in NodeNgramNgram tables
- **tfidf**: (publis_link_type:node - doc:node - term:ngr -
correlation:float) in NodeNodeNgram tables.
Here "type" is the node carrying the nature of the obtained stat, or the
reference of the graph for cooc and of the index tied to publication
search for tfidf.
There are also the relations that contain no counts but are
essential for building the counts of the others:
- map/main/stop lists: (list_type:node - form or term:ngr) in
NodeNgram tables
- "groups": (mainform:ngr - subform:ngr) in NodeNgramNgram
tables.
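For illustration only, here is a sketch of how these n-tuples could look as rows (the ids below are invented for the example, and the column names are assumptions based on the descriptions above, not the actual schema):
```python
# Illustrative only: hypothetical rows mirroring the tuple shapes above.

# 1doc: (doc:node, form:ngr, weight:float) in NodeNgram
one_doc_row = {"node_id": 9, "ngram_id": 1754, "weight": 2.0}

# occs/gen/spec/tirank: (measure_type:node, term:ngr, weight:float) in NodeNgram
occs_row = {"node_id": 120, "ngram_id": 1754, "weight": 9.0}  # 120 = an invented "OCCURRENCES" node

# cooc: (graph_type:node, term1:ngr, term2:ngr, weight:float) in NodeNgramNgram
cooc_row = {"node_id": 119, "ngram1_id": 1754, "ngram2_id": 1981, "weight": 3.0}

# tfidf: (publis_link_type:node, doc:node, term:ngr, correlation:float) in NodeNodeNgram
tfidf_row = {"node1_id": 130, "node2_id": 9, "ngram_id": 1754, "score": 0.42}
```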
### Update scenarios
In the course of the "user scenarios", several events come along and
**modify these counts**:
A. term creations performed by the user (e.g. by selection/addition
in the annotation view)
B. imports of terms corresponding to forms never indexed on
this corpus
C. term ungroupings performed by the user
D. moving a term from the stop list to the other lists
E. any other list change and/or creation of new
groups...
A and B are the only two stages, apart from the initial extraction, where
forms are added. Currently A and B are handled right away for
level 1 (per-doc tables): it seems best to perform the
re-indexing of the 1doc nodes as early as possible after A or B. For the
annotations view, the user expects the highlighting to appear
immediately on the displayed doc. For import B, this is convenient because
we have the list of new terms at hand, which avoids storing it
somewhere while waiting for a later recomputation.
The other information updated right away for A and B is membership
in the lists and in the groups (for B), which requires no computation.
C, D and E do not affect level 1 (per-doc tables) since they do not
add new forms, but they do constitute modifications
of the lists and groups, and must therefore trigger an
update of the tfidf (this requires a recomputation) and of the
coocs on the map list (effect applied when a new graph is requested).
C and D also require an update of the per-term stats
(occurrences, gen/spec etc.) since subform elements and
stop-list elements do not appear in the stats.
So to summarize, in all cases:
=> additions to a list or a group, and any counting of a
new form in the docs, are handled as soon as the user acts
=> but the more "advanced" modelings represented by the
occs, gen and spec stats and by the "coocs on map" and
"tfidf" working tables must wait for a recomputation.
Ideally, in the future, they would all be updated incrementally instead
of forcing this recomputation... but that is where we stand for now.
### Associated functions
| | GUI | API action → url | VIEW | SUBROUTINES |
|-------|-------------------------------------------------------|--------------------------------------------------------------------|---------------------------|------------------------------------------------------------------------------------------------------------------------------------|
| A | "annotations/highlight.js, annotations/ngramlists.js" | "PUT → api/ngrams, PUT/DEL → api/ngramlists/change" | "ApiNgrams, ListChange" | util.toolchain.ngrams_addition.index_new_ngrams |
| B | NGrams_dyna_chart_and_table | POST/PATCH → api/ngramlists/import | CSVLists | "util.ngramlists_tools.import_ngramlists, util.ngramlists_tools.merge_ngramlists, util.toolchain.ngrams_addition.index_new_ngrams" |
| C,D,E | NGrams_dyna_chart_and_table | "PUT/DEL → api/ngramlists/change, PUT/DEL → api/ngramlists/groups" | "ListChange, GroupChange" | util.toolchain.ngrams_addition.index_new_ngrams |
Import B was put back into service a few weeks ago, and I have just
reconnected A in the annotations view.
# Contribution guide
## Community
* [http://gargantext.org/about](http://gargantext.org/about)
* IRC Chat: (OFTC/FreeNode) #gargantex
## Tools
* gogs
* server access
* gargantext box
## Gargantex
* Gargantex box install
see [install procedure](install.md)
* Architecture Overview
* Database Schema Overview
* Interface design Overview
## To do:
* Docs
* Interface design
* [Parsers](./overview/parser.md) / [scrapers](./overview/scraper.md)
* Computing
## How to contribute:
1. Clone the repo
2. Create a new branch <username>-refactoring
3. Run the gargantext-box
4. Code
5. Test
6. Commit
94eb7bdf57557b72dcd1b93a42af044b pubmed.zip
# API
Be more careful about authorizations.
cf. "ng-resource".
# Projects
## Overview of all projects
- re-implement deletion
## Single project view
- re-implement deletion
# Taggers
Path for data used by taggers should be defined in `gargantext.constants`.
# Database
# Sharing
Here follows a brief description of how sharing could be implemented.
## Database representation
The database representation of sharing can be distributed among 4 tables:
- `persons`, of which items represent either a user or a group
- `relationships` describes the relationships between persons (affiliation
of a user to a group, contact between two users, etc.)
- `nodes` contains the projects, corpora, documents, etc. to share (they shall
inherit the sharing properties from their parents)
- `permissions` stores the relations between the three tables
  described above: each row consists of 2 foreign keys, plus an integer
  between 1 and 3 representing the level of sharing, the start date
  (when the sharing was set) and the end date (when necessary, the time
  at which sharing was removed, `NULL` otherwise)
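As a rough illustration (a sketch under assumed column names, not the final schema), the `permissions` table described above could be declared like this:
```python
# Hypothetical SQLAlchemy sketch of the `permissions` table described above;
# the column names are assumptions, only the described fields are represented.
from sqlalchemy import Column, DateTime, ForeignKey, Integer, MetaData, Table

metadata = MetaData()

permissions = Table('permissions', metadata,
    Column('person_id',  Integer, ForeignKey('persons.id'), primary_key=True),
    Column('node_id',    Integer, ForeignKey('nodes.id'),   primary_key=True),
    Column('permission', Integer,  nullable=False),  # sharing level, 1..3
    Column('start_date', DateTime, nullable=False),  # when the sharing was set
    Column('end_date',   DateTime, nullable=True),   # NULL until sharing is removed
)
```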
## Python code
The permission levels should be set in `gargantext.constants`, and defined as:
```python
PERMISSION_NONE = 0 # 0b0000
PERMISSION_READ = 1 # 0b0001
PERMISSION_WRITE = 3 # 0b0011
PERMISSION_OWNER = 7 # 0b0111
```
The requests to check for permissions (or add new ones) should not be rewritten
every time. They should be "hidden" within the models (a sketch follows the list below):
- `Person.owns(node)` returns a boolean
- `Person.can_read(node)` returns a boolean
- `Person.can_write(node)` returns a boolean
- `Person.give_right(node, permission)` gives a right to a given user
- `Person.remove_right(node, permission)` removes a right from a given user
- `Person.get_nodes(permission[, type])` returns an iterator on the list of
nodes on which the person has at least the given permission (optional
argument: type of requested node)
- `Node.get_persons(permission[, type])` returns an iterator on the list of
users who have at least the given permission on the node (optional argument:
type of requested persons, such as `USER` or `GROUP`)
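Since the levels are cumulative bit patterns, such a check can reduce to a bitwise AND; a minimal sketch (the helper below is illustrative, not an existing API):
```python
# Minimal sketch: a level grants `required` iff all required bits are set in it.
PERMISSION_NONE  = 0  # 0b0000
PERMISSION_READ  = 1  # 0b0001
PERMISSION_WRITE = 3  # 0b0011
PERMISSION_OWNER = 7  # 0b0111

def has_permission(level, required):
    return (level & required) == required

# an owner can write, a plain reader cannot:
assert has_permission(PERMISSION_OWNER, PERMISSION_WRITE)
assert not has_permission(PERMISSION_READ, PERMISSION_WRITE)
```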
## Example
Let's imagine the `persons` table contains the following data:
| id | type | username |
|----|-------|-----------|
| 1 | USER | David |
| 2 | GROUP | C.N.R.S. |
| 3 | USER | Alexandre |
| 4 | USER | Untel |
| 5 | GROUP | I.S.C. |
| 6 | USER | Bidule |
Assume "David" owns the groups "C.N.R.S." and "I.S.C.", "Alexandre" belongs to
the group "I.S.C.", with "Untel" and "Bidule" belonging to the group "C.N.R.S.".
"Alexandre" and "David" are in contact.
The `relationships` table then contains:
| person1_id | person2_id | type |
|------------|------------|---------|
| 1 | 2 | OWNER |
| 1 | 5 | OWNER |
| 3 | 2 | MEMBER |
| 4 | 5 | MEMBER |
| 6 | 5 | MEMBER |
| 1 | 3 | CONTACT |
The `nodes` table is populated as such:
| id | type | name |
|----|----------|----------------------|
| 12 | PROJECT | My super project |
| 13 | CORPUS | A given corpus |
| 13 | CORPUS | The corpus |
| 14 | DOCUMENT | Some document |
| 15 | DOCUMENT | Another document |
| 16 | DOCUMENT | Yet another document |
| 17 | DOCUMENT | Last document |
| 18 | PROJECT | Another project |
| 19 | PROJECT | That project |
If we want to express that "David" created "My super project" (and its children)
and wants everyone in "C.N.R.S." to be able to view it, but not access it,
`permissions` should contain:
| person_id | node_id | permission |
|-----------|---------|------------|
| 1 | 12 | OWNER |
| 2 | 12 | READ |
If "David" also wanted "Alexandre" (and no one else) to view and modify "The
corpus" (and its children), we would have:
| person_id | node_id | permission |
|-----------|---------|------------|
| 1 | 12 | OWNER |
| 2 | 12 | READ |
| 3 | 13 | WRITE |
If "Alexandre" created "That project" and wants "Bidule" (and no one else) to be
able to view and modify it (and its children), the table should then have:
| person_id | node_id | permission |
|-----------|---------|------------|
| 3 | 19 | OWNER |
| 6 | 19 | WRITE |
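To make the "and its children" inheritance rule concrete, here is a minimal sketch; the parent links are assumed from the example (documents under corpus 13, itself under project 12), and none of this is the actual implementation:
```python
# Hypothetical in-memory mirror of the example tables above.
NODES = {12: None, 13: 12, 14: 13, 15: 13}  # node_id -> parent_id (assumed)
PERMISSIONS = {(1, 12): "OWNER", (2, 12): "READ", (3, 13): "WRITE"}

def effective_permission(person_id, node_id):
    """Return the closest explicit permission, inherited from ancestors."""
    while node_id is not None:
        perm = PERMISSIONS.get((person_id, node_id))
        if perm is not None:
            return perm
        node_id = NODES[node_id]  # climb to the parent
    return None

# "Alexandre" (3) can write "Some document" (14) via "The corpus" (13):
assert effective_permission(3, 14) == "WRITE"
# a C.N.R.S. member (group 2) reaches READ on the same document via project 12:
assert effective_permission(2, 14) == "READ"
```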
# User guide
1. Login
Run the Gargantext box following the install procedure.
Open a web browser at http://127.0.0.1:8000/,
click on Test Gargantext,
and log in with:
```
Login : gargantua
Password : autnagrag
```
2. Create a project
3. Import an existing corpus
4. Create corpus from search
5. Explore stats
6. Explore graphs
7. Query
8. Refine
* Time periods
* Nodes
9. Export
# Architecture Overview
# Database Schema
# Website
Gargantext is a web platform to explore your corpora using text-mining [...](about.md)
## Getting started
* [Install](install.md) the Gargantext box
* [Take a tour](demo.md) of the different features offered by Gargantext
## Need some help?
Ask the community at:
* [http://gargantext.org/about](http://gargantext.org/about)
* IRC Chat: (OFTC/FreeNode) #gargantex
## Want to contribute?
* take a look at the [architecture overview](overview.md)
* read the [contribution guide](contribution-guide.md)
## News
## Credits and acknowledgments
# Install Instructions for Gargamelle
Gargamelle is the Gargantext platform toolbox: a full platform system
with minimal modules.
First you need to get the source code to install it.
The folder will be /srv/gargantext:
* docs contains all information on gargantext
(/srv/gargantext/docs/)
* install contains all the installation files
(/srv/gargantext/install/)
Help needed?
See [http://gargantext.org/about](http://gargantext.org/about) and [tools](./contribution_guide.md) for the community
## Get the source code
by cloning gargantext into /srv/gargantext
``` bash
git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
&& cd /srv/gargantext \
&& git fetch origin stable \
&& git checkout stable \
```
## Install
```bash
# go into the directory
user@computer: cd /srv/gargantext/
# go inside the installation folder
user@computer: cd install
#execute the installation
user@computer: ./install
```
The installation requires creating a user for gargantext; you will be asked:
```bash
Username (leave blank to use 'gargantua'):
#email is not mandatory
Email address:
Password:
Password (again):
```
If this step completes successfully, you should see:
```bash
Superuser created successfully.
[ ok ] Stopping PostgreSQL 9.5 database server: main.
```
## Run
Once installation is complete, the Gargantext platform will be available at localhost:8000.
To start the Gargantext platform:
``` bash
# go into the directory
user@computer: cd /srv/gargantext/
# launch the start script
user@computer: ./start
#type ctrl+d to exit or simply type exit in terminal;
```
Then open up a chromium browser and go to localhost:8000
Click on "Enter Gargantext"
Log in with the username and password you created
Enjoy! ;)
* Create user gargantua
The main user of Gargantext is Gargantua (the role of Pantagruel is coming soon)!
``` bash
sudo adduser --disabled-password --gecos "" gargantua
```
* Create the directories you need
here for the example the gargantext packages will be installed in /srv/
``` bash
for dir in "/srv/gargantext" \
           "/srv/gargantext_lib" \
           "/srv/gargantext_static" \
           "/srv/gargantext_media" \
           "/srv/env_3-5"; do
    sudo mkdir -p $dir ;
    sudo chown gargantua:gargantua $dir ;
done
```
You should see:
```bash
$ tree /srv
/srv
├── env_3-5
├── gargantext
├── gargantext_lib
├── gargantext_media
└── gargantext_static
* Get the main libraries
Download, uncompress, and give the main user access to it.
Please be patient: due to the size of the library packages (27 GB),
this step can take a while.
``` bash
wget http://dl.gargantext.org/gargantext_lib.tar.bz2 \
&& tar xvjf gargantext_lib.tar.bz2 -o /srv/gargantext_lib \
&& sudo chown -R gargantua:gargantua /srv/gargantext_lib \
&& echo "Libs installed"
```
* Get the source code of Gargantext
by cloning the repository of gargantext
``` bash
git clone ssh://gitolite@delanoe.org:1979/gargantext /srv/gargantext \
&& cd /srv/gargantext \
&& git fetch origin refactoring \
&& git checkout refactoring \
```
TODO(soon): git clone https://gogs.iscpif.fr/gargantext.git
See the [next steps of installation procedure](install.md#Install)
# Architecture Overview
# Database Schema
# Website
# HOW TO: Reference a new web scraper/API + parser
## Global scope
Three main moves to make:
- develop and index a parser
in gargantext.util.parsers
- develop and index a scraper
in gargantext.moissonneurs
- adapt the forms for the new source
in templates and views
## Reference the parser in the gargantext website
The gargantext website is stored in gargantext/gargantext.
### Reference your new parser in constants.py
* import your parser (l. 125)
```
from gargantext.util.parsers import \
EuropressParser, RISParser, PubmedParser, ISIParser, CSVParser, ISTexParser, CernParser
```
The parser name corresponds to the parser class referenced in gargantext/util/parser;
here the name is CernParser.
* index your RESOURCETYPE
into RESOURCETYPES (l. 145) **at the end of the list**
```
# type 10
{ "name": 'SCOAP (XML MARC21 Format)',
"parser": CernParser,
"default_language": "en",
'accepted_formats':["zip","xml"],
},
```
Note that the name here is composed of the API name (SCOAP) + (GENERICFILETYPE FORMAT_XML Format).
The complexity of the naming corresponds to three things:
* the name of the API (different from the producing organization)
* the format type: XML
* the XML standard of this format: MARC21 (cf. CernParser in gargantext/util/parser/Cern.py)
The default_language corresponds to the default accepted language, which **should load** the corresponding default tagger
```
from gargantext.util.taggers import NltkTagger
```
TO DO: load the tagger types on demand depending on the languages and the install
TO DO: offer a module to download additional parsers
TO DO: provide install tagger module scripts inside lib
The formats correspond to the file types accepted when a file is sent through the
parsing form available in `gargantext/view/pages/projects.py` and
exposed in `/templates/pages/projects/project.html`
## Reference your parser script
## Add your parser script into the folder gargantext/util/parser/
Here my filename was Cern.py.
## Declare it in gargantext/util/parser/__init__.py
from .Cern import CernParser
At this step, you will be able to see your parser and add a file with the form,
but nothing will happen yet.
## The right way to write the parser script
Three main (and only) requirements:
* your parser class should inherit from the base class _Parser()
(`gargantext/gargantext/util/parser/_Parser`)
* your parser class must have a parse method that takes a **file buffer** as input
* your parser must structure and store its data in a variable named **hyperdata_list**
to be properly indexed by the toolchain
! Be careful with the date format: provide publication_date as a string in the format YYYY-mm-dd HH:MM:SS
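For instance, a parse method could append records shaped like this (a sketch: only the publication_date format is prescribed above, the other keys are illustrative):
```python
# Hypothetical record appended by a parse() method; note the date format.
hyperdata_list = []
hyperdata_list.append({
    "title": "Some article title",
    "authors": "A. Author, B. Author",
    "publication_date": "2016-05-27 00:00:00",  # string, YYYY-mm-dd HH:MM:SS
})
```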
# Adding a scraper API to offer a search option:
In progress.
* Add a pop-up question "Do you have a corpus?";
the search option is in /templates/pages/projects/project.html line 181
## Reference a scraper (moissonneur) in gargantext
* add accepted_formats in constants
* add a check_file routine in the form check ==> but it should inherit from utils/files.py,
which also implements the upload size limit check
# Suggestions for next steps:
* XML parser MARC21, UNIMARC ...
* A project type is qualified by the first element added, i.e.:
the first element determines the corpus type of all the corpora within the project
# Resources
Adding a new source to Gargantext requires first declaring
the source inside constants.py
```python
RESOURCETYPES= [
{ "type":9, #give a unique type int
"name": 'SCOAP [XML]', #resource name as proposed into the add corpus FORM [generic format]
"parser": "CernParser", #name of the new parser class inside a CERN.py file (set to None if not implemented)
"format": 'MARC21', #specific format
'file_formats':["zip","xml"],# accepted file format
"crawler": "CernCrawler", #name of the new crawler class inside a CERN.py file (set to None if no Crawler implemented)
'default_languages': ['en', 'fr'], #supported default languages of the source
},
...
]
```
## Adding a new parser
Once you have declared your new parser inside constants.py,
add your new parser file into /srv/gargantext/utils/parsers/
following this naming convention:
* The filename must be in uppercase, without the Parser mention.
e.g. MailParser => MAIL.py
* Inside this file, the Parser must be named with the exact spelling declared as parser in constants.py
* Your new parser shall inherit from the base class Parser and provide a parse(filebuffer) method
```python
#!/usr/bin/env python3
#filename:/srv/gargantext/util/parser/MAIL.py:
from ._Parser import Parser
class MailParser(Parser):
def parse(self, file):
...
```
## Adding a new crawler
Once you have declared your new crawler inside constants.py,
add your new crawler file into /srv/gargantext/utils/crawlers/
following this naming convention:
* The filename must be in uppercase, without the Crawler mention.
e.g. MailCrawler => MAIL.py
* Inside this file, the Crawler must be named with the exact spelling declared as crawler in constants.py
* Your new crawler shall inherit from the base class Crawler and provide three methods:
* scan_results => ids
* sample => yes/no
* fetch
```python
#!/usr/bin/env python3
#filename:/srv/gargantext/util/crawler/MAIL.py:
from ._Crawler import Crawler
class MailCrawler(Crawler):
def scan_results(self, query):
...
self.ids = set()
def sample(self, results_nb):
...
def fetch(self, ids):
    ...
// dot ngram_parsing_flow.dot -Tpng -o ngram_parsing_flow.png
digraph ngramflow {
edge [fontsize=10] ;
label=<<B><U>gargantext.util.toolchain</U></B><BR/>(ngram extraction flow)>;
labelloc="t" ;
"extracted_ngrams" -> "grouplist" ;
"extracted_ngrams" -> "occs+tfidfs" ;
"main_user_stoplist" -> "stoplist" ;
"stoplist" -> "mainlist" ;
"occs+tfidfs" -> "mainlist" [label=" TFIDF_LIMIT"];
"mainlist" -> "coocs" [label=" COOCS_THRESHOLD"] ;
"coocs" -> "specificity" ;
"specificity" -> "maplist" [label="MAPLIST_LIMIT\nMONOGRAM_PART"];
"maplist" -> "explore" ;
"grouplist" -> "maplist" ;
}
"""
# WARNING: to ensure consistency and retrocompatibility, lists should keep the
# initial order (ie., new elements should be appended at the end of the lists)
abstract:
---------
something between global params, constants,
configuration variables, ini file...
contents:
---------
+ db constants/ontology
- nodetypes
(db int <=> named types <=> python code)
+ input low-level limits
- query size
- max upload size
- doc parsing batch size
- word extraction batch size
+ process config
- resourcetypes config (~ input ontology)
- wordlist generation params
- graph creation params
- £TODO sequence of transformations "custom pipeline"
+ input process subclasses/subroutines
- crawling, import
- tagger services and functions
- parser services
- stemmer services
"""
import os
import re
import importlib
from gargantext.util.lists import *
from gargantext.util.tools import datetime, convert_to_date
import re
from .settings import BASE_DIR
# types & models (nodes, lists, hyperdata, resource) ---------------------------------------------
LISTTYPES = {
......@@ -119,10 +155,8 @@ INDEXED_HYPERDATA = {
# resources ---------------------------------------------
def get_resource(sourcetype):
'''resource :: type => resource dict'''
for n in RESOURCETYPES:
if int(n["type"]) == int(sourcetype):
return n
return None
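# NB: assumes RESOURCETYPES is kept sorted by "type" with no gaps,
# i.e. the resource of type n sits at index n-1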
return RESOURCETYPES[sourcetype-1]
def get_resource_by_name(sourcename):
'''resource :: name => resource dict'''
for n in RESOURCETYPES:
......@@ -332,8 +366,6 @@ DEFAULT_CSV_DELIM_GROUP = '|&|'
# Files ----------------------------------------------------------------
import os
from .settings import BASE_DIR
# uploads/.gitignore prevents corpora indexing
# copora can be either a folder or symlink towards specific partition
UPLOAD_DIRECTORY = os.path.join(BASE_DIR, 'uploads/corpora')
......@@ -350,6 +382,9 @@ BATCH_NGRAMSEXTRACTION_SIZE = 3000 # how many new node-ngram relations before
QUERY_SIZE_N_MAX = 1000
QUERY_SIZE_N_DEFAULT = 1000
# Refresh corpora workflow status for project view's progressbar
PROJECT_VIEW_REFRESH_INTERVAL = 3000 # 1st refresh in ms (then increasing arithmetically)
PROJECT_VIEW_MAX_REFRESH_ATTEMPTS = 10 # how many times before we give up
# ------------------------------------------------------------------------------
# Graph <=> nodes API parameters
......@@ -360,7 +395,7 @@ DEFAULT_N_DOCS_HAVING_NGRAM = 5
# Graph constraints to compute the graph:
# Modes: live graph generation, graph asynchronously computed or errors detected
# here are the maximum size of corpus and maplist required to compute the graph
graph_constraints = {'corpusMax' : 599
graph_constraints = {'corpusMax' : 100
,'corpusMin' : 40
,'mapList' : 50
}
......@@ -40,7 +40,7 @@ CELERY_ACCEPT_CONTENT = ['pickle', 'json', 'msgpack', 'yaml']
CELERY_IMPORTS = (
"gargantext.util.toolchain",
"gargantext.util.crawlers",
"graph.cooccurrences",
"graph.graph",
"moissonneurs.pubmed",
"moissonneurs.istex",
)
......
......@@ -10,7 +10,7 @@ for resource in RESOURCETYPES:
try:
name =resource["crawler"]
#crawler is type basename+"Crawler"
filename = name.replace("Crawler", "").lower()
filename = name.replace("Crawler", "").upper()
module = base_parser+".%s" %(filename)
importlib.import_module(module)
......
......@@ -106,6 +106,11 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stat
mustcommit = False
# create temporary table with given data
sql_columns = 'id INTEGER'
cursor.execute('BEGIN WORK;')
cursor.execute('LOCK TABLE %s IN SHARE ROW EXCLUSIVE MODE;' % model.__tablename__)
for field in fields:
column = getattr(model, field)
sql_columns += ', %s %s' % (field, column.type, )
......@@ -160,3 +165,6 @@ def bulk_insert_ifnotexists(model, uniquekey, fields, data, cursor=None, do_stat
return result, n_new
else:
return result
cursor.execute('COMMIT WORK;')
......@@ -46,6 +46,9 @@ class ModelCache(dict):
class Cache:
def __getattr__(self, key):
'''
lazy init of new modelcaches: self.Node, self.User...
'''
try:
model = getattr(models, key)
except AttributeError:
......@@ -54,4 +57,15 @@ class Cache:
setattr(self, key, modelcache)
return modelcache
def clean_all(self):
'''
re-init any existing modelcaches
'''
for modelname in self.__dict__:
old_modelcache = getattr(cache, modelname)
new_modelcache = ModelCache(old_modelcache._model)
del old_modelcache
setattr(cache, modelname, new_modelcache)
cache = Cache()
......@@ -32,6 +32,10 @@ def requires_auth(func):
from gargantext.util.db import session
session.rollback()
print("=== session rollback ok!")
# re init the global cache (it must still have detached instances)
from gargantext.util.db_cache import cache
cache.clean_all()
print("=== cache reinit ok!")
# and relogin for safety
url = '/auth/login/?next=%s' % urlencode(request.path)
return redirect(url)
......
......@@ -462,6 +462,9 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
for j, colname in enumerate(csv_row):
if colname in ['label', 'status', 'forms']:
columns[colname] = j
# skip empty columns
elif match(r'^\s*$',colname):
pass
else:
raise ValueError('Wrong header "%s" on line %i (only possible headers are "label", "forms" and "status")' % (colname, n_read_lines))
if 'label' not in columns:
......@@ -548,7 +551,9 @@ def import_ngramlists(fname, delimiter=DEFAULT_CSV_DELIM,
imported_ngrams_dbdata.append((ngram_str, n_words))
# returns a dict {term => id} and a count of inserted ones
# -------------------------
(new_ngrams_ids, n_added_ng) = bulk_insert_ifnotexists(
# -------------------------
model = Ngram,
uniquekey = 'terms',
fields = ('terms', 'n'),
......@@ -612,7 +617,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
- resolves conflicts if terms belong in different lists
> map wins over both other types
> main wins over stop
> stop never wins
> stop never wins £TODO STOP wins over candidates from main
@param new_lists: a dict of *new* imported lists with format:
{'stop': UnweightedList,
......@@ -667,7 +672,10 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
for ng_id in new_lists[list_type].items:
collect(ng_id)
from gargantext.util.toolchain.main import t
print("MERGE DEBUG: starting index_new_ngrams", t())
n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
print("MERGE DEBUG: finished index_new_ngrams", t())
my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)
......@@ -677,7 +685,7 @@ def merge_ngramlists(new_lists={}, onto_corpus=None, del_originals=[]):
# DB nodes stored with same indices 0,1,2 (resp. stop, miam and map)
# find target ids of the list node objects
tgt_nodeids = [
onto_corpus.children("STOPLIST").first().id,
onto_corpus.children("STOPLIST").first().id, # £todo via parent project?
onto_corpus.children("MAINLIST").first().id,
onto_corpus.children("MAPLIST").first().id
]
......
......@@ -24,7 +24,7 @@ class CernParser(Parser):
"773":{
"c": "pages",
"n": "issue",
"p": "journal",
"p": "source",
"v": "volume",
"y": "publication_year"
},
......
......@@ -159,11 +159,11 @@ class EuropresseParser(Parser):
try:
pub_name = html_article.xpath(name_xpath)[0].text
name = pub_name.split(', ')
hyperdata['journal'] = name[0]
hyperdata['number'] = name[1]
hyperdata['source'] = name[0]
hyperdata['number'] = name[1]
except:
try:
hyperdata['journal'] = pub_name.strip()
hyperdata['source'] = pub_name.strip()
except:
pass
......
......@@ -10,7 +10,7 @@ class ISIParser(RISParser):
"TI": {"type": "hyperdata", "key": "title", "separator": " "},
"AU": {"type": "hyperdata", "key": "authors", "separator": ", "},
"DI": {"type": "hyperdata", "key": "doi"},
"SO": {"type": "hyperdata", "key": "journal"},
"SO": {"type": "hyperdata", "key": "source"},
"PY": {"type": "hyperdata", "key": "publication_year"},
"PD": {"type": "hyperdata", "key": "publication_month"},
"LA": {"type": "hyperdata", "key": "language_fullname"},
......
......@@ -64,7 +64,7 @@ class ISTexParser(Parser):
hyperdata["publication_date"].append( hyperdata["host"]["pubdate"] )
if "title" in hyperdata["host"]:
hyperdata["journal"] = hyperdata["host"]["title"]
hyperdata["source"] = hyperdata["host"]["title"]
authors=False
if "authorsRAW" in hyperdata:
......
......@@ -7,7 +7,7 @@ from io import BytesIO
class PubmedParser(Parser):
hyperdata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"source" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText',
"title2" : 'MedlineCitation/Article/VernacularTitle',
......
......@@ -19,7 +19,7 @@ class RepecParser(Parser):
b"T1": {"type": "hyperdata", "key": "title", "separator": " "},
b"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
b"A1": {"type": "hyperdata", "key": "authors", "separator": "\n"},
b"JO": {"type": "hyperdata", "key": "journal"},
b"JO": {"type": "hyperdata", "key": "source"},
b"UR": {"type": "hyperdata", "key": "doi"},
b"Y1": {"type": "hyperdata", "key": "publication_year"},
b"PD": {"type": "hyperdata", "key": "publication_month"},
......
......@@ -15,8 +15,8 @@ class RISParser(Parser):
"ST": {"type": "hyperdata", "key": "subtitle", "separator": " "},
"AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
"JO": {"type": "hyperdata", "key": "journal"},
"T2": {"type": "hyperdata", "key": "journal"},
"JO": {"type": "hyperdata", "key": "source"},
"T2": {"type": "hyperdata", "key": "source"},
# "T2": variant of JO (if together only last will be kept)
"UR": {"type": "hyperdata", "key": "doi"},
......
#!/bin/bash
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/srv/gargantext_lib/taggers/nlpserver/TurboParser/deps/local/lib:"
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/srv/gargantext_lib/taggers/nlpserver:/srv/gargantext_lib/taggers/nlpserver/TurboParser/deps/local/lib:"
if [[ ! "$VIRTUAL_ENV" ]]
then
......
......@@ -27,7 +27,6 @@ def is_stop_word(ngram, stop_words=None):
# , "(.*)(\.)(.*)" trop fort (enlève les sigles !)
, "(.*)(\,)(.*)"
, "(.*)(< ?/?p ?>)(.*)" # marques de paragraphes
, "(.*)(study|elsevier)(.*)"
, "(.*)\b(xx|xi|xv)\b(.*)"
, "(.*)(result)(.*)"
, "(.*)(year|année|nombre|moitié)(.*)"
......@@ -37,6 +36,87 @@ def is_stop_word(ngram, stop_words=None):
, "(.*)(terme)(.*)"
, "(.*)(différent)(.*)"
, "(.*)(travers)(.*)"
# academic stamps
, ".*\belsevier\b.*"
, ".*\bwiley\b.*)"
, ".*\bspringer\b.*"
, ".*university press\b.*"
, ".*\bstudy\b.*"
# academic terms when alone ~~> usually not informative
, "hypothes[ie]s$"
, "analys[ie]s$"
, "bas[ie]s$"
, "online$"
, "importance$"
, "uses?$"
, "cases?$"
, "effects?$"
, "times?$"
, "methods?$"
, "types?$"
, "evidences?$"
, "findings?$"
, "relations?$"
, "terms?$"
, "procedures?$"
, "factors?$"
, "reports?$"
, "changes?$"
, "facts?$"
, "others?$"
, "applications?$"
, "periods?$"
, "investigations?$"
, "orders?$"
, "forms?$"
, "conditions?$"
, "situations?$"
, "papers?$"
, "relationships?$"
, "values?$"
, "areas?$"
, "techniques?$"
, "means?$"
, "conclusions?$"
, "comparisons?$"
, "parts?$"
, "amounts?$"
, "aims?$"
, "lacks?$"
, "issues?$"
, "ways?$"
, "ranges?$"
, "models?$"
, "articles?$"
, "series?$"
, "totals?$"
, "influences?$"
, "journals?$"
, "rules?$"
, "persons?$"
, "abstracts?$"
, "(?:book)? reviews?$"
, "process(?:es)?$"
, "approach(?:es)?$"
, "theor(?:y|ies)?$"
, "methodolog(?:y|ies)?$"
, "similarit(?:y|ies)?$"
, "possibilit(?:y|ies)?$"
, "stud(?:y|ies)?$"
# non-thematic or non-NP expressions
, "none$"
, "other(?: hand)?$"
, "whereas$"
, "usually$"
, "and$"
# , "vol$"
, "eds?$"
, "ltd$"
, "copyright$"
, "e-?mails?$"
, ".*="
, "=.*"
, "further(?:more)?$"
, "(.*)(:|\|)(.*)"
] :
compiled_regexes.append(compile(regex))
......
......@@ -19,9 +19,12 @@ procedure:
from gargantext.models import Ngram, Node, NodeNgram
from gargantext.util.db import session, bulk_insert
from gargantext.util.db import bulk_insert_ifnotexists # £TODO debug
from sqlalchemy import distinct
from re import findall, IGNORECASE
from gargantext.util.toolchain.main import t # timer
# TODO from gargantext.constants import LIST_OF_KEYS_TO_INDEX = title, abstract
def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
......@@ -39,22 +42,17 @@ def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
@param corpus: the CORPUS node
@param keys: the hyperdata fields to index
"""
# check the ngrams we won't process (those that were already indexed)
indexed_ngrams_subquery = (session
.query(distinct(NodeNgram.ngram_id))
.join(Node, Node.id == NodeNgram.node_id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == 'DOCUMENT')
.subquery()
)
# FIXME too slow: index_new_ngrams should be faster via tsvector on DB
"""
# retrieve the ngrams from our list, filtering out the already indexed ones
# retrieve *all* the ngrams from our list
# (even if some relations may be already indexed
# b/c they were perhaps not extracted in all docs
# => we'll use already_indexed later)
todo_ngrams = (session
.query(Ngram)
.filter(Ngram.id.in_(ngram_ids))
.filter(~ Ngram.id.in_(indexed_ngrams_subquery))
.all()
)
......@@ -62,7 +60,11 @@ def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
node_ngram_to_write = {}
# loop throught the docs and their text fields
for doc in corpus.children('DOCUMENT'):
for (i, doc) in enumerate(corpus.children('DOCUMENT')):
if (i % 100 == 0):
print('CORPUS #%d: [%s] ngrams_addition: doc %i' % (corpus.id, t(), i))
print()
# a new empty counting subdict
node_ngram_to_write[doc.id] = {}
......@@ -90,22 +92,49 @@ def index_new_ngrams(ngram_ids, corpus, keys=('title', 'abstract', )):
else:
node_ngram_to_write[doc.id][ngram.id] += n_occs
# debug
# print("new node_ngrams before filter:", node_ngram_to_write)
# check the relations we won't insert (those that were already indexed)
# NB costly but currently impossible with bulk_insert_ifnotexists
# b/c double uniquekey
already_indexed = (session
.query(NodeNgram.node_id, NodeNgram.ngram_id)
.join(Node, Node.id == NodeNgram.node_id)
.filter(Node.parent_id == corpus.id)
.filter(Node.typename == 'DOCUMENT')
.all()
)
filter_out = {(nd_id,ng_id) for (nd_id,ng_id) in already_indexed}
# POSSIBLE update those that are filtered out if wei_previous != wei
# integrate all at the end
my_new_rows = []
add_new_row = my_new_rows.append
for doc_id in node_ngram_to_write:
for ngram_id in node_ngram_to_write[doc_id]:
wei = node_ngram_to_write[doc_id][ngram_id]
add_new_row([doc_id, ngram_id, wei])
if (doc_id, ngram_id) not in filter_out:
wei = node_ngram_to_write[doc_id][ngram_id]
add_new_row([doc_id, ngram_id, wei])
del node_ngram_to_write
# debug
# print("new node_ngrams after filter:", my_new_rows)
bulk_insert(
table = NodeNgram,
fields = ('node_id', 'ngram_id', 'weight'),
data = my_new_rows
)
# bulk_insert_ifnotexists(
# model = NodeNgram,
# uniquekey = ('node_id','ngram_id'), <= currently impossible
# fields = ('node_id', 'ngram_id', 'weight'),
# data = my_new_rows
# )
n_added = len(my_new_rows)
print("index_new_ngrams: added %i new NodeNgram rows" % n_added)
......
from rest_framework.status import *
from rest_framework.exceptions import APIException
from rest_framework.response import Response
from rest_framework.renderers import JSONRenderer, BrowsableAPIRenderer
from rest_framework.views import APIView
from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from rest_framework.permissions import IsAuthenticated
from gargantext.constants import RESOURCETYPES, NODETYPES, get_resource
from gargantext.models import Node, Ngram, NodeNgram, NodeNodeNgram, NodeNode
from gargantext.util.db import session, delete, func, bulk_insert
from gargantext.util.db_cache import cache, or_
from gargantext.util.files import upload
from gargantext.util.http import ValidationException, APIView, JsonHttpResponse, get_parameters
from gargantext.util.scheduling import scheduled
from gargantext.util.validation import validate
#import
#NODES format
_user_default_fields =["is_staff","is_superuser","is_active", "username", "email", "first_name", "last_name", "id"]
_api_default_fields = ['id', 'parent_id', 'name', 'typename', 'date']
_doc_default_fields = ['id', 'parent_id', 'name', 'typename', 'date', "hyperdata"]
#_resource_default_fields = [['id', 'parent_id', 'name', 'typename', "hyperdata.method"]
#_corpus_default_fields = ['id', 'parent_id', 'name', 'typename', 'date', "hyperdata","resource"]
def format_parent(node):
'''format the parent'''
try:
#USER
if node.username != "":
return {field: getattr(node, field) for field in _user_default_fields}
except:
#DOC
if node.typename == "DOCUMENT":
return {field: getattr(node, field) for field in _doc_default_fields}
elif node.typename == "CORPUS":
parent = {field: getattr(node, field) for field in _doc_default_fields}
#documents
#parent["documents"] = {"count":node.children("DOCUMENT").count()}
#resources
#parent["resources"] = {"count":node.children("RESOURCE").count()}
#status
#return {field: getattr(node, field) for field in _doc_default_fields}
status = node.status()
if status is not None and not status['complete']:
    if not status['error']:
        status_message = '(in progress: %s, %d complete)' % (
            status['action'].replace('_', ' '), status['progress'])
    else:
        status_message = '(aborted: "%s" after %i docs)' % (
            status['error'][-1], status['progress'])
else:
    status_message = ''
parent["status_msg"] = status_message
return parent
#PROJECT, RESOURCES?
else:
return {field: getattr(node, field) for field in _api_default_fields}
def format_records(node_list):
'''format the records list'''
if len(node_list) == 0:
return []
node1 = node_list[0]
#USER
if node1.typename == "USER":
return [{field: getattr(node, field) for field in _user_default_fields} for node in node_list]
#DOCUMENT
elif node1.typename == "DOCUMENT":
return [{field: getattr(node, field) for field in _doc_default_fields} for node in node_list]
#CORPUS, PROJECT, RESOURCES?
elif node1.typename == "CORPUS":
records = []
for node in node_list:
#PROJECTS VIEW SHOULD NOT BE SO DETAILED
record = {field: getattr(node, field) for field in _doc_default_fields}
record["resources"] = [n.id for n in node.children("RESOURCE")]
record["documents"] = [n.id for n in node.children("DOCUMENT")]
#record["resources"] = format_records([n for n in node.children("RESOURCE")])
#record["documents"] = format_records([n for n in node.children("DOCUMENT")])
status = node.status()
if status is not None and not status['complete']:
if not status['error']:
status_message = '(in progress: %s, %d complete)' % (
status['action'].replace('_', ' '),
status['progress'],
)
else:
status_message = '(aborted: "%s" after %i docs)' % (
status['error'][-1],
status['progress']
)
else:
status_message = ''
record["status"] = status_message
records.append(record)
return records
else:
return [{field: getattr(node, field) for field in _api_default_fields} for node in node_list]
def check_rights(request, node_id):
'''check that the node belongs to the USER'''
node = session.query(Node).filter(Node.id == node_id).first()
if node is None:
raise APIException("403 Unauthorized")
# return Response({'detail' : "Node #%s not found" %(node_id) },
# status = status.HTTP_404_NOT_FOUND)
elif node.user_id != request.user.id:
#response_data = {"log": "Unauthorized"}
#return JsonHttpResponse(response_data, status=403)
raise APIException("403 Unauthorized")
else:
return node
def format_response(parent, records):
#print(records)
return { "parent": format_parent(parent),
"records": format_records(records),
"count":len(records)
}
from django.core.exceptions import *
from .api import * #APIView, APIException entre autres
from gargantext.util.db import session
from gargantext.models import Node
from gargantext.util.http import *
class CorpusView(APIView):
'''API endpoint that represents a corpus'''
def get(self, request, project_id, corpus_id, view = "DOCUMENT"):
'''GET corpus detail
default view full documents
'''
params = get_parameters(request)
if "view" in params.keys():
filter_view = params["view"].upper()
if view in ["DOCUMENT", "JOURNAL", "TITLE", "ANALYTICS", "RESSOURCE"]:
view = filter_view
project = session.query(Node).filter(Node.id == project_id, Node.typename == "PROJECT").first()
check_rights(request, project.id)
if project is None:
return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
status = status.HTTP_404_NOT_FOUND)
corpus = session.query(Node).filter(Node.id == corpus_id, Node.typename == "CORPUS").first()
if corpus is None:
return Response({'detail' : "CORPUS Node #%s not found" %(corpus_id) },
status = status.HTTP_404_NOT_FOUND)
documents = session.query(Node).filter(Node.parent_id == corpus_id, Node.typename == view).all()
context = format_response(corpus, documents)
return Response(context)
def delete(self, request, project_id, corpus_id):
'''DELETE corpus'''
print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>delete")
# project = session.query(Node).filter(Node.id == project_id, Node.typename == "PROJECT").first()
# check_rights(request, project.id)
# if project is None:
# return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
# status = status.HTTP_404_NOT_FOUND)
corpus = session.query(Node).filter(Node.id == corpus_id, Node.typename == "CORPUS").first()
if corpus is None:
return Response({'detail' : "CORPUS Node #%s not found" %(corpus_id) },
status = status.HTTP_404_NOT_FOUND)
documents = session.query(Node).filter(Node.parent_id == corpus_id).all()
for document in documents:
    session.delete(document)
session.delete(corpus)
session.commit()
return Response(detail="Deleted corpus #%s" %str(corpus_id), status=HTTP_204_NO_CONTENT)
def put(self, request, project_id, corpus_id, view="DOCUMENT"):
'''UPDATE corpus'''
project = session.query(Node).filter(Node.id == project_id, Node.typename == "PROJECT").first()
if project is None:
    return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
        status = status.HTTP_404_NOT_FOUND)
project = check_rights(request, project.id)
corpus = session.query(Node).filter(Node.id == corpus_id, Node.typename == "CORPUS").first()
if corpus is None:
return Response({'detail' : "CORPUS Node #%s not found" %(corpus_id) },
status = status.HTTP_404_NOT_FOUND)
#documents = session.query(Node).filter(Node.parent_id == corpus_id, Node.typename= view).all()
for key, val in request.data.items():
    if key in ["name", "date", "username", "hyperdata"]:
        if key == "username":
            #change of owner
            #user = session.query(Node).filter(Node.typename=="USER", Node.username== username).first()
            #print(user)
            #set(node, user_id, user.id)
            pass
        elif key == "hyperdata":
            #updating some contextual values of the corpus
            pass
        else:
            setattr(corpus, key, val)
session.add(corpus)
session.commit()
'''#updating children???
'''
return Response({"detail":"Updated corpus #" %str(corpus.id)}, status=HTTP_202_ACCEPTED)
def post(self, request, project_id, corpus_id):
'''ADD a new RESOURCE to CORPUS'''
project = session.query(Node).filter(Node.id == project_id, Node.typename == "PROJECT").first()
check_rights(request, project.id)
if project is None:
return Response({'detail' : "PROJECT Node #%s not found" %(project_id) },
status = status.HTTP_404_NOT_FOUND)
corpus = session.query(Node).filter(Node.id == corpus_id, Node.typename == "CORPUS").first()
if corpus is None:
return Response({'detail' : "CORPUS Node #%s not found" %(corpus_id) },
status = status.HTTP_404_NOT_FOUND)
......@@ -85,8 +85,8 @@ class CSVLists(APIView):
# import the csv
try:
new_lists = import_ngramlists(csv_file)
print("===============================!!!")
print(new_lists)
print("======new_lists=========================!!!")
# print(new_lists) # very long
del csv_file
# merge the new_lists onto those of the target corpus
......@@ -373,6 +373,10 @@ class ListChange(APIView):
def put(self, request):
"""
Adds one or more ngrams to a list.
NB: we assume ngram_ids don't contain subforms !!
(this assumption is not checked here because it would be
slow: if you want to add a subform, send the mainform's id)
"""
# union of items ----------------------------
new_list = self.base_list + self.change_list
......
......@@ -2,8 +2,8 @@ from gargantext.util.http import ValidationException, APIView \
, get_parameters, JsonHttpResponse\
, HttpResponse
from gargantext.util.db import session, func
from gargantext.util.db_cache import cache
from gargantext.models import Node, Ngram, NodeNgram
from gargantext.util.db_cache import cache
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram
from sqlalchemy.orm import aliased
from re import findall
......@@ -21,7 +21,7 @@ class ApiNgrams(APIView):
"""
Used for analytics
------------------
Get ngram listing + counts in a given scope
"""
# parameters retrieval and validation
......@@ -83,24 +83,30 @@ class ApiNgrams(APIView):
1 - checks user authentication before any changes
2 - checks if the ngram already exists in the Ngram table in DB
if yes returns ngram_id and optionally mainform_id
otherwise continues
3 - adds the ngram to Ngram table in DB
4 - (if corpus param is present)
adds the ngram doc counts to NodeNgram table in DB
(aka "index the ngram" through the docs of the corpus)
5 - returns json with:
'msg' => a success msg
'text' => the initial text content
'term' => the normalized text content
'id' => the new ngram_id
'count' => the number of docs with the ngram in the corpus
(if corpus param is present)
'group' => the mainform_id if applicable
possible inline parameters
--------------------------
@param text=<ngram_string> [required]
@param corpus=<CORPUS_ID> [optional]
@param testgroup (true if present) [optional, requires corpus]
"""
# 1 - check user authentication
......@@ -122,6 +128,9 @@ class ApiNgrams(APIView):
It requires a "text" parameter,\
for instance /api/ngrams?text=hydrometallurgy')
if ('testgroup' in params) and (not ('corpus' in params)):
raise ValidationException("'testgroup' param requires 'corpus' param")
# if we have a 'corpus' param (to do the indexing)...
do_indexation = False
if 'corpus' in params:
......@@ -143,10 +152,33 @@ class ApiNgrams(APIView):
try:
log_msg = ""
ngram_id = None
mainform_id = None
preexisting = session.query(Ngram).filter(Ngram.terms==ngram_str).first()
if preexisting is not None:
ngram_id = preexisting.id
log_msg += "ngram already existed (id %i)\n" % ngram_id
# in the context of a corpus we can also check if it has a mainform
# (useful for the 'testgroup' option)
if 'testgroup' in params:
groupings_id = (session.query(Node.id)
.filter(Node.parent_id == corpus_id)
.filter(Node.typename == 'GROUPLIST')
.scalar()   # plain id rather than a one-element row
)
had_mainform = (session.query(NodeNgramNgram.ngram1_id)
.filter(NodeNgramNgram.node_id == groupings_id)
.filter(NodeNgramNgram.ngram2_id == preexisting.id)
.first()
)
if had_mainform:
mainform_id = had_mainform[0]
log_msg += "ngram had mainform (id %i) in this corpus" % mainform_id
else:
log_msg += "ngram was not in any group for this corpus"
else:
# 2 - insert into Ngrams
new_ngram = Ngram(terms=ngram_str, n=ngram_size)
......@@ -165,6 +197,7 @@ class ApiNgrams(APIView):
'text': original_text,
'term': ngram_str,
'id' : ngram_id,
'group' : mainform_id,
'count': n_added if do_indexation else 'no corpus provided for indexation'
}, 200)
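Put together, a successful round-trip could look like this (ids and counts invented for illustration):

# PUT /api/ngrams?text=hydrometallurgy&corpus=123&testgroup
# could answer:
# {
#   "msg":   "ngram already existed (id 4567)\nngram had mainform (id 89) in this corpus",
#   "text":  "hydrometallurgy",
#   "term":  "hydrometallurgy",
#   "id":    4567,
#   "group": 89,
#   "count": 12
# }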
......
from django.conf.urls import url
from . import nodes
from . import projects
from . import corpora
from . import ngrams
from . import metrics
from . import ngramlists
......@@ -10,7 +12,33 @@ from graph.rest import Graph
urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view() )
, url(r'^nodes/(\d+)$' , nodes.NodeResource.as_view() )
, url(r'^nodes/(\d+)/having$' , nodes.NodeListHaving.as_view() )
, url(r'^nodes/(\d+)/status$' , nodes.Status.as_view() )
#Projects
, url(r'^projects$' , projects.ProjectList.as_view() )
, url(r'^projects/(\d+)$' , projects.ProjectView.as_view() )
#?view=resource
#?view=docs
#Corpora
, url(r'^projects/(\d+)/corpora/(\d+)$' , corpora.CorpusView.as_view() )
#?view=source
#?view=title
#?view=analytics
#Sources
#, url(r'^projects/(\d+)/corpora/(\d+)/sources$' , corpora.CorpusSources.as_view() )
#, url(r'^projects/(\d+)/corpora/(\d+)/sources/(\d+)$' , corpora.CorpusSourceView.as_view() )
#Facets
, url(r'^projects/(\d+)/corpora/(\d+)/facets$' , nodes.CorpusFacet.as_view() )
#Favorites
, url(r'^projects/(\d+)/corpora/(\d+)/favorites$', nodes.CorpusFavorites.as_view() )
#Metrics
, url(r'^projects/(\d+)/corpora/(\d+)/metrics$', metrics.CorpusMetrics.as_view() )
#GraphExplorer
, url(r'^projects/(\d+)/corpora/(\d+)/explorer$' , Graph.as_view())
# data for graph explorer (json)
# GET /api/projects/43198/corpora/111107/explorer?
# Corresponding view is : /projects/43198/corpora/111107/explorer?
# Parameters (example):
# explorer?field1=ngrams&field2=ngrams&distance=conditional&bridgeness=5&start=1996-6-1&end=2002-10-5
# Ngrams
, url(r'^ngrams/?$' , ngrams.ApiNgrams.as_view() )
......@@ -63,10 +91,5 @@ urlpatterns = [ url(r'^nodes$' , nodes.NodeListResource.as_view()
, url(r'^ngramlists/maplist$' , ngramlists.MapListGlance.as_view() )
# fast access to maplist, similarly formatted for termtable
, url(r'^projects/(\d+)/corpora/(\d+)/explorer$' , Graph.as_view())
# data for graph explorer (json)
# GET /api/projects/43198/corpora/111107/explorer?
# Corresponding view is : /projects/43198/corpora/111107/explorer?
# Parameters (example):
# explorer?field1=ngrams&field2=ngrams&distance=conditional&bridgeness=5&start=1996-6-1&end=2002-10-5
]
......@@ -28,6 +28,8 @@ def docs_by_titles(request, project_id, corpus_id):
authorized, user, project, corpus = _get_user_project_corpus(request, project_id, corpus_id)
if not authorized:
return HttpResponseForbidden()
source_type = corpus.resources()[0]['type']
# response!
return render(
template_name = 'pages/corpora/titles.html',
......@@ -37,18 +39,18 @@ def docs_by_titles(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus': corpus,
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'resourcename' : get_resource(source_type)['name'],
'view': 'titles',
'user': request.user
},
)
@requires_auth
def docs_by_journals(request, project_id, corpus_id):
def docs_by_sources(request, project_id, corpus_id):
'''
Browse journal titles for a given corpus
NB: javascript in page will GET counts from our api: facets?subfield=journal
# TODO refactor Journals_dyna_charts_and_table.js
Browse source titles for a given corpus
NB: javascript in page will GET counts from our api: facets?subfield=source
# TODO refactor Sources_dyna_charts_and_table.js
'''
# we pass our corpus to mark it's a corpora page
corpus = cache.Node[corpus_id]
......@@ -56,17 +58,19 @@ def docs_by_journals(request, project_id, corpus_id):
# and the project just for project.id in corpusBannerTop
project = cache.Node[project_id]
# rendered page : journals.html
source_type = corpus.resources()[0]['type']
# rendered page : sources.html
return render(
template_name = 'pages/corpora/journals.html',
template_name = 'pages/corpora/sources.html',
request = request,
context = {
'debug': settings.DEBUG,
'date': datetime.now(),
'project': project,
'corpus' : corpus,
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'view': 'journals'
'resourcename' : get_resource(source_type)['name'],
'view': 'sources'
},
)
......@@ -75,7 +79,7 @@ def docs_by_authors(request, project_id, corpus_id):
'''
Browse authors for a given corpus
NB: javascript in page will GET counts from our api: facets?subfield=author
# TODO refactor Author && Journals_dyna_charts_and_table.js
# TODO refactor Author && Sources_dyna_charts_and_table.js
'''
# we pass our corpus to mark it's a corpora page
corpus = cache.Node[corpus_id]
......@@ -83,7 +87,9 @@ def docs_by_authors(request, project_id, corpus_id):
# and the project just for project.id in corpusBannerTop
project = cache.Node[project_id]
# rendered page : journals.html
source_type = corpus.resources()[0]['type']
# rendered page : authors.html
return render(
template_name = 'pages/corpora/authors.html',
request = request,
......@@ -92,7 +98,7 @@ def docs_by_authors(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus' : corpus,
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'resourcename' : get_resource(source_type)['name'],
'view': 'authors'
},
)
......@@ -103,6 +109,9 @@ def analytics(request, project_id, corpus_id):
authorized, user, project, corpus = _get_user_project_corpus(request, project_id, corpus_id)
if not authorized:
return HttpResponseForbidden()
source_type = corpus.resources()[0]['type']
# response!
return render(
template_name = 'pages/analytics/histories.html',
......@@ -112,7 +121,7 @@ def analytics(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus': corpus,
'resourcename' : get_resource_by_name(corpus.resources()[0]),
'resourcename' : get_resource(source_type)['name'],
'view': 'analytics',
'user': request.user
},
......
......@@ -54,6 +54,9 @@ def overview(request):
# projects owned by the user's contacts
'common_users': (contact for contact, projects in contacts_projects),
'common_projects': sum((projects for contact, projects in contacts_projects), []),
# status refreshing params (when active workflows)
'status_refresh_initial_interval': PROJECT_VIEW_REFRESH_INTERVAL,
'status_refresh_max_attempts': PROJECT_VIEW_MAX_REFRESH_ATTEMPTS,
},
)
......@@ -64,10 +67,14 @@ class NewCorpusForm(forms.Form):
source_list.insert(0, (0,"Select a database below"))
type = forms.ChoiceField(
choices = source_list,
widget = forms.Select(attrs={ 'onchange' :'CustomForSelect( $("option:selected", this).text() );'})
widget = forms.Select(attrs={ 'onchange' :'CustomForSelect(this.value); checkReady()'})
)
name = forms.CharField( label='Name', max_length=199 ,
widget = forms.TextInput(attrs={ 'required': 'true', 'onkeyup':'checkReady()' })
)
file = forms.FileField(
widget = forms.FileInput(attrs={ 'onchange':'checkReady()' })
)
name = forms.CharField( label='Name', max_length=199 , widget=forms.TextInput(attrs={ 'required': 'true' }))
file = forms.FileField()
def clean_file(self):
file_ = self.cleaned_data.get('file')
......@@ -171,6 +178,9 @@ def project(request, project_id):
'cooclists': [],
'number': len(corpora),
'query_size': QUERY_SIZE_N_DEFAULT,
# status refreshing params (when active workflows)
'status_refresh_initial_interval': PROJECT_VIEW_REFRESH_INTERVAL,
'status_refresh_max_attempts': PROJECT_VIEW_MAX_REFRESH_ATTEMPTS,
},
)
# response!
......@@ -189,5 +199,8 @@ def project(request, project_id):
'cooclists': [],
'number': len(corpora),
'query_size': QUERY_SIZE_N_DEFAULT,
# status refreshing params (when active workflows)
'status_refresh_initial_interval': PROJECT_VIEW_REFRESH_INTERVAL,
'status_refresh_max_attempts': PROJECT_VIEW_MAX_REFRESH_ATTEMPTS,
},
)
......@@ -2,7 +2,7 @@ from gargantext.util.http import requires_auth, render, settings
from gargantext.util.db import session
from gargantext.util.db_cache import cache
from gargantext.models import Node
from gargantext.constants import get_resource_by_name
from gargantext.constants import get_resource
from datetime import datetime
@requires_auth
......@@ -31,6 +31,7 @@ def ngramtable(request, project_id, corpus_id):
# .filter(Node.id != corpus_id)
corpora_infos = corpora_infos_q.all()
source_type = corpus.resources()[0]['type']
# rendered page : terms.html
return render(
......@@ -42,7 +43,7 @@ def ngramtable(request, project_id, corpus_id):
'date': datetime.now(),
'project': project,
'corpus' : corpus,
'resourcename' : get_resource_by_name(corpus),
'resourcename' : get_resource(source_type)['name'],
'view': 'terms',
# for the CSV import modal
......
......@@ -24,10 +24,10 @@ urlpatterns = [
# corpora
url(r'^projects/(\d+)/corpora/(\d+)/?$', corpora.docs_by_titles),
# corpus by journals
url(r'^projects/(\d+)/corpora/(\d+)/journals/?$', corpora.docs_by_journals),
# corpus by sources
url(r'^projects/(\d+)/corpora/(\d+)/sources/?$', corpora.docs_by_sources),
# corpus by authors
url(r'^projects/(\d+)/corpora/(\d+)/authors/?$', corpora.docs_by_authors),
# terms table for the corpus
......
Module Graph Explorer: from text to graph
=========================================

## Graph Explorer main

0) All urls.py of the Graph Explorer
   -> Graph Explorer
   -> My graph View
   -> REST API to get Data
1) Main view of the graph explorer: views.py
2) Graph is generated (graph.py) through different steps
   a) check the constraints (graph_constraints) in gargantext/constants.py
   b) Data are retrieved as REST
      rest.py: check REST parameters
   c) graph.py:
      get_graph: check Graph parameters
      compute_graph: compute the graph
      1) Cooccurrences are computed (live or asynchronously): cooccurrences.py
      2) Threshold and distances: distances.py
      3) clustering: louvain.py
      4) links between communities: bridgeness.py
   d) compress the graph before returning it: utils.py
3) Additional features:
   a) intersection of graphs: intersection.py

## How to contribute?

Some solutions:
1) please report to dev@gargantext.org
2) fix with git repo and pull request

## TODO

1) save parameters in hyperdata
2) graph explorer:
   * save current graph
3) myGraphs view:
   * progress bar
   * show already computed graphs vs to be computed with parameters
   * show parameters
   * copy / paste and change some parameters to generate new graph
......@@ -7,20 +7,28 @@ from collections import defaultdict
from networkx.readwrite import json_graph
def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
# Data are stored in a dict(), (== hashmap by default for Python)
'''
Bridgeness = measure to control links (bridges) between communities.
'''
# Data are stored in a dict(), (== hashmap by default with Python)
data = dict()
if type == "node_link":
nodesB_dict = {}
for node_id in G.nodes():
#node,type(labels[node])
nodesB_dict [ ids[node_id][1] ] = True
# TODO the query below is not optimized (do it in do_distance).
the_label = session.query(Ngram.terms).filter(Ngram.id==node_id).first()
the_label = ", ".join(the_label)
G.node[node_id]['label'] = the_label
G.node[node_id]['size'] = weight[node_id]
G.node[node_id]['type'] = ids[node_id][0].replace("ngrams","terms")
G.node[node_id]['attributes'] = { "clust_default": partition[node_id]} # new format
# G.add_edge(node, "cluster " + str(partition[node]), weight=3)
......@@ -62,12 +70,20 @@ def filterByBridgeness(G,partition,ids,weight,bridgeness,type,field1,field2):
if bridgeness > 0:
for c1 in com_link.keys():
for c2 in com_link[c1].keys():
index = round(bridgeness*len(com_link[c1][c2]) / (len(com_ids[c1]) + len(com_ids[c2])))
index = round(
bridgeness * len( com_link[c1][c2] )
/ #----------------------------------#
( len(com_ids[c1]) + len(com_ids[c2] ))
)
#print((c1,len(com_ids[c1])), (c2,len(com_ids[c2])), index)
if index > 0:
for link in sorted(com_link[c1][c2], key=lambda x: x[2], reverse=True)[:index]:
for link in sorted( com_link[c1][c2]
, key=lambda x: x[2]
, reverse=True)[:index]:
#print(c1, c2, link[2])
info = {"s": link[0], "t": link[1], "w": link[2]}
links.append(info)
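A quick numeric check of the quota computed above (values invented for illustration):

# two communities with 10 and 5 nodes, 6 links between them, bridgeness = 5:
index = round(5 * 6 / (10 + 5))   # round(2.0) -> 2
# => only the 2 heaviest of the 6 inter-community links are kept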
......
from gargantext.models import Node, Ngram, NodeNgram, NodeNgramNgram, \
NodeHyperdata, HyperdataKey
from gargantext.util.db import session, aliased, bulk_insert, func
from gargantext.util.db import session, aliased, func
from gargantext.util.lists import WeightedMatrix, UnweightedList, Translations
from sqlalchemy import desc, asc, or_, and_
from datetime import datetime
#import inspect
import datetime
from celery import shared_task
def filterMatrix(matrix, mapList_id, groupList_id):
mapList = UnweightedList( mapList_id )
group_list = Translations ( groupList_id )
cooc = matrix & (mapList * group_list)
return cooc
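The operators used here come from gargantext.util.lists and their exact semantics are not shown in this diff; a rough plain-Python analogy of what the filtering achieves (assumed behaviour, not the real classes) would be:

# 'mapList * group_list' expands the map list with its grouped variants,
# then '&' keeps only matrix entries whose two terms survive that expansion
map_terms   = {"term_a", "term_b"}             # stand-in for UnweightedList
group_terms = {"term_b_variant": "term_b"}     # stand-in for Translations (variant -> mainform)
kept_terms  = map_terms | set(group_terms)
matrix      = {("term_a", "term_b"): 3.0,
               ("term_a", "other"):  5.0}      # stand-in for WeightedMatrix
cooc = {pair: w for pair, w in matrix.items()
        if pair[0] in kept_terms and pair[1] in kept_terms}
# cooc == {("term_a", "term_b"): 3.0}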
@shared_task
def countCooccurrences( corpus_id=None , cooc_id=None
, field1='ngrams' , field2='ngrams'
, start=None , end=None
, mapList_id=None , groupList_id=None
, distance=None , bridgeness=None
, n_min=1, n_max=None , limit=1000
, isMonopartite=True , threshold = 3
, save_on_db= True , reset=True
):
'''
Compute the cooccurence matrix and save it, returning NodeNgramNgram.node_id
For the moment, lists of parameters are not supported because lists need to
be merged beforehand.
corpus :: Corpus
mapList_id :: Int
groupList_id :: Int
For the moment, start and end are simple: only year is implemented so far
start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
end :: TimeStamp
limit :: Int
'''
# TODO : add hyperdata here
# Security test
field1,field2 = str(field1), str(field2)
# Parameters to save in hyperdata of the Node Cooc
# FIXME remove the lines below after factorization of parameters
parameters = dict()
parameters['field1'] = field1
parameters['field2'] = field2
......@@ -57,17 +46,17 @@ def countCooccurrences( corpus_id=None , test= False
# Get corpus as Python object
corpus = session.query(Node).filter(Node.id==corpus_id).first()
# Get node
if not coocNode_id:
coocNode_id = ( session.query( Node.id )
# Get node of the Graph
if not cooc_id:
cooc_id = ( session.query( Node.id )
.filter( Node.typename == "COOCCURRENCES"
, Node.name == "GRAPH EXPLORER"
, Node.parent_id == corpus.id
)
.first()
)
if not coocNode_id:
if not cooc_id:
coocNode = corpus.add_child(
typename = "COOCCURRENCES",
name = "GRAPH (in corpus %s)" % corpus.id
......@@ -75,14 +64,24 @@ def countCooccurrences( corpus_id=None , test= False
session.add(coocNode)
session.commit()
coocNode_id = coocNode.id
cooc_id = coocNode.id
else :
coocNode_id = coocNode_id[0]
cooc_id = int(cooc_id[0])
if reset == True :
session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == coocNode_id ).delete()
# when cooc_id preexisted, but we want to continue (reset = True)
# (to give new contents to this cooc_id)
elif reset:
print("GRAPH #%s ... Counting new cooccurrences data." % cooc_id)
session.query( NodeNgramNgram ).filter( NodeNgramNgram.node_id == cooc_id ).delete()
session.commit()
# when cooc_id preexisted and we just want to load it (reset = False)
else:
print("GRAPH #%s ... Loading cooccurrences computed already." % cooc_id)
cooc = session.query( NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id, NodeNgramNgram.weight ).filter( NodeNgramNgram.node_id == cooc_id ).all()
return(int(cooc_id),WeightedMatrix(cooc))
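In short, a hedged usage sketch of the three modes this logic allows (all ids invented):

# create a new COOCCURRENCES node under the corpus and fill it
countCooccurrences(corpus_id=123, mapList_id=78, groupList_id=79)

# recompute and overwrite the contents of an existing graph node
countCooccurrences(corpus_id=123, cooc_id=456, reset=True,
                   mapList_id=78, groupList_id=79)

# reload an already-computed matrix without recounting
cooc_id, matrix = countCooccurrences(corpus_id=123, cooc_id=456, reset=False)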
NodeNgramX = aliased(NodeNgram)
......@@ -161,8 +160,8 @@ def countCooccurrences( corpus_id=None , test= False
# Cooc between the dates start and end
if start is not None:
#date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
# TODO : more complexe date format here.
date_start = datetime.datetime.strptime (str(start), "%Y-%m-%d")
# TODO : more precise date format here (day is actually the smallest grain).
date_start = datetime.strptime (str(start), "%Y-%m-%d")
date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
Start=aliased(NodeHyperdata)
......@@ -177,8 +176,8 @@ def countCooccurrences( corpus_id=None , test= False
if end is not None:
# TODO : more complexe date format here.
date_end = datetime.datetime.strptime (str(end), "%Y-%m-%d")
# TODO : more precise date format here (day is actually the smallest grain).
date_end = datetime.strptime (str(end), "%Y-%m-%d")
date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
End=aliased(NodeHyperdata)
......@@ -208,22 +207,29 @@ def countCooccurrences( corpus_id=None , test= False
#cooc_query = cooc_query.order_by(desc('cooc_score'))
matrix = WeightedMatrix(cooc_query)
print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
cooc = filterMatrix(matrix, mapList_id, groupList_id)
parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(mapList_id)
parameters['MapList_id'] = str(mapList_id)
parameters['GroupList_id'] = str(groupList_id)
# TODO factorize savings on db
if save_on_db:
# Saving cooc Matrix
cooc.save(coocNode_id)
# Saving the cooccurrences
cooc.save(cooc_id)
print("GRAPH #%s ... Node Cooccurrence Matrix saved" % cooc_id)
# Saving the parameters
coocNode = session.query(Node).filter(Node.id==coocNode_id).first()
coocNode.hyperdata = parameters
session.add(coocNode)
print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
coocNode = session.query(Node).filter(Node.id==cooc_id).first()
coocNode.hyperdata["parameters"] = dict()
coocNode.hyperdata["parameters"] = parameters
coocNode.save_hyperdata()
session.commit()
# Log message
print("Cooccurrence Matrix saved")
return cooc
#data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
#return data
return(coocNode.id, cooc)
......@@ -16,16 +16,16 @@ import networkx as nx
def clusterByDistances( cooc_matrix
, field1=None, field2=None
, distance='conditional'):
, distance=None):
'''
do_distance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
clusterByDistance :: Coocs[nga, ngb => ccweight] -> (Graph, Partition, {ids}, {weight})
'''
# implicit global session
authorized = ['conditional', 'distributional', 'cosine']
if distance not in authorized:
distance = 'conditional'
raise ValueError("Distance must be in %s" % str(authorized))
matrix = defaultdict(lambda : defaultdict(float))
ids = defaultdict(lambda : defaultdict(int))
......
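Note the behaviour change above: an unrecognised distance now fails fast instead of silently falling back to 'conditional'. Illustration ('euclidean' is deliberately not an implemented distance):

cooc_matrix = {}   # contents irrelevant here: validation fails first
try:
    clusterByDistances(cooc_matrix, distance='euclidean')
except ValueError as err:
    print(err)     # Distance must be in ['conditional', 'distributional', 'cosine']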
from gargantext.models.users import User
from gargantext.util.db import session
from django.core.mail import send_mail
from gargantext.settings import BASE_URL
def notify_owner(corpus,cooc_id,distance,bridgeness):
user = session.query(User).filter(User.id == corpus.user_id).first()
message = '''
Hello,
The graph computation has just finished in your corpus entitled:
%s
You can access and rename your Graph at:
http://%s/projects/%d/corpora/%d/explorer?cooc_id=%d&distance=%s&bridgeness=%d
We remain at your disposal for any further information.
Best regards
--
The Gargantext team (CNRS)
''' % (corpus.name, BASE_URL, corpus.parent_id, corpus.id, cooc_id, distance, bridgeness)
if user.email != "" :
send_mail('[Gargantext] Your Graph has been computed'
, message
, 'team@gargantext.org'
, [user.email], fail_silently=False )
else:
print("User %s (%d), has no email" % (user.username, user.id) )
#from rest_framework.authentication import SessionAuthentication, BasicAuthentication
from gargantext.util.db import session
from gargantext.models.nodes import Node
from graph.graph import get_graph
from graph.utils import compress_graph, format_html
from gargantext.util.http import APIView, APIException\
, JsonHttpResponse, requires_auth
from gargantext.constants import graph_constraints
from traceback import format_tb
def compress_graph(graphdata):
"""
graph data is usually a dict with 2 slots:
"nodes": [{"id":4103, "type":"terms", "attributes":{"clust_default": 0}, "size":29, "label":"regard"},...]
"links": [{"t": 998,"s": 768,"w": 0.0425531914893617},...]
To send this data over the net, this function reduces its size a lot:
- keep fewer decimals in the float value of each link's weight
- use shorter names for node properties (eg: s/clust_default/cl/)
result format:
"nodes": [{"id":4103, "at":{"cl": 0}, "s":29, "lb":"regard"},...]
"links": [{"t": 998,"s": 768,"w": 0.042},...]
"""
for link in graphdata['links']:
link['w'] = format(link['w'], '.3f') # keep only 3 decimals
for node in graphdata['nodes']:
node['lb'] = node['label']
del node['label']
node['at'] = node['attributes']
del node['attributes']
node['at']['cl'] = node['at']['clust_default']
del node['at']['clust_default']
node['s'] = node['size']
del node['size']
if node['type'] == "terms":
# it's the default type for our format, so we don't need it
del node['type']
else:
node['t'] = node['type']
del node['type']
return graphdata
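Running it on the sample data from the docstring shows the transformation (the weight becomes a 3-decimal string and the default "terms" type is dropped):

graph = {
    "nodes": [{"id": 4103, "type": "terms",
               "attributes": {"clust_default": 0},
               "size": 29, "label": "regard"}],
    "links": [{"t": 998, "s": 768, "w": 0.0425531914893617}],
}
compress_graph(graph)
# -> {'nodes': [{'id': 4103, 'lb': 'regard', 'at': {'cl': 0}, 's': 29}],
#     'links': [{'t': 998, 's': 768, 'w': '0.043'}]}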
def format_html(link):
"""
Build an html link adapted to our json message format
"""
return "<a class='msglink' href='%s'>%s</a>" % (link, link)
# TODO check authentication
from traceback import format_tb
class Graph(APIView):
'''
......@@ -67,11 +18,30 @@ class Graph(APIView):
Get all the parameters first
graph?field1=ngrams&field2=ngrams&
graph?field1=ngrams&field2=ngrams&start=''&end=''
NB save new graph mode
(option saveOnly=True without a cooc_id)
can return the new cooc id in the json
before counting + filling the data asynchronously
'''
if not request.user.is_authenticated():
# can't use @requires_auth because of positional 'self' within class
return HttpResponse('Unauthorized', status=401)
# Get the node we are working with
corpus = session.query(Node).filter(Node.id==corpus_id).first()
# TODO Parameters to save in hyperdata of the Node Cooc
# WARNING: we could factorize the parameters as dict but ...
# ... it causes a bug in the asynchronous function!
# Check celery upgrades before.
# Example (for the future):
# parameters = dict()
# parameters['field1'] = field1
# parameters['field2'] = field2
# Get all the parameters in the URL
cooc_id = request.GET.get ('cooc_id' , None )
saveOnly = request.GET.get ('saveOnly' , None )
......@@ -91,8 +61,7 @@ class Graph(APIView):
type_ = str(request.GET.get ('type' , 'node_link' ))
distance = str(request.GET.get ('distance' , 'conditional'))
# Get default value if no map list
# Get default map List of corpus
if mapList_id == 0 :
mapList_id = ( session.query ( Node.id )
.filter( Node.typename == "MAPLIST"
......@@ -104,7 +73,6 @@ class Graph(APIView):
mapList_id = mapList_id[0]
if mapList_id == None :
# todo add as an error msg ?
raise ValueError("MAPLIST node needed for cooccurrences")
......@@ -120,36 +88,26 @@ class Graph(APIView):
groupList_id = groupList_id[0]
if groupList_id == None :
# todo add as an error msg ?
raise ValueError("GROUPLIST node needed for cooccurrences")
# Check the options
# Declare accepted fields
accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
accepted_field2 = ['ngrams', ]
options = ['start', 'end', 'threshold', 'distance', 'cooc_id' ]
try:
# Test params
# Check if parameters are accepted
if (field1 in accepted_field1) and (field2 in accepted_field2):
if start is not None and end is not None :
data = get_graph( corpus=corpus, cooc_id = cooc_id
#, field1=field1 , field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, start=start , end=end
, threshold =threshold , distance=distance
, saveOnly=saveOnly
)
else:
data = get_graph( corpus = corpus, cooc_id = cooc_id
#, field1=field1, field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, threshold = threshold
, distance = distance
, bridgeness = bridgeness
, saveOnly=saveOnly
)
data = get_graph( corpus=corpus, cooc_id = cooc_id
, field1=field1 , field2=field2
, mapList_id = mapList_id , groupList_id = groupList_id
, start=start , end=end
, threshold =threshold
, distance=distance , bridgeness=bridgeness
, saveOnly=saveOnly
)
# data :: Either (Dic Nodes Links) (Dic State Length)
......@@ -173,10 +131,12 @@ class Graph(APIView):
# async data case
link = "http://%s/projects/%d/corpora/%d/myGraphs" % (request.get_host(), corpus.parent_id, corpus.id)
return JsonHttpResponse({
'id': data["target_id"],
'name': data["target_name"],
'date': data["target_date"],
'msg': '''Your graph is being saved:
%s
''' % format_html(link)
}, status=200)
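So in the asynchronous case the client receives something like this (illustrative values only):

# {
#   "id": 456,
#   "name": "GRAPH (in corpus 123)",
#   "date": "2017-01-10T14:02:03",
#   "msg": "Your graph is being saved:\n<a class='msglink' href='http://host/projects/1/corpora/123/myGraphs'>...</a>"
# }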
elif data["state"] == "corpusMin":
......