Merge branch 'mat' of ssh://delanoe.org:1979/gargantext

832351f8 · Administrator · 4387de88 · 2ca5116a · 832351f8 · 832351f8
Commit 832351f8 authored Oct 22, 2014 by Administrator
4 changed files
--- a/.ipynb_checkpoints/Languages integration (only run once!)-checkpoint.ipynb
+++ b/.ipynb_checkpoints/Languages integration (only run once!)-checkpoint.ipynb
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:7c80ed9f4b088e13444efb451a1ee46e5727247be14aaf30ddf0236a49ac461b"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": []
+}
\ No newline at end of file
--- a/.ipynb_checkpoints/test_parser_ngramextractor (Mat)-checkpoint.ipynb
+++ b/.ipynb_checkpoints/test_parser_ngramextractor (Mat)-checkpoint.ipynb
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from pprint import pprint\n",
+      "from node.models import Node, NodeType, Language, Ngram\n",
+      "from django.contrib.auth.models import User\n",
+      "import parsing\n",
+      "from parsing.FileParsers import *"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Define user\n",
+      "try:\n",
+      "    user = User.objects.get(username='Mat')\n",
+      "except:\n",
+      "    user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
+      "    user.save()\n",
+      "\n",
+      "# Define document types\n",
+      "nodetypes = {}\n",
+      "for name in ['Corpus', 'Document']:\n",
+      "    try:\n",
+      "        nodetypes[name] = NodeType.objects.get(name=name)\n",
+      "    except:\n",
+      "        nodetypes[name] = NodeType(name=name)\n",
+      "        nodetypes[name].save()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "Node.objects.all().delete()\n",
+      "corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
+      "corpus.save()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "fileparser.parse(corpus)\n",
+      "print('Ok!')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "Warning: parsing empty text\n",
+        "Warning: parsing empty text\n",
+        "Warning: parsing empty text"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "Warning: parsing empty text\n",
+        "Warning: parsing empty text"
+       ]
+      }
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "for node_ngram in corpus.children.first().node_ngram_set.all():\n",
+      "    print(node_ngram.ngram.terms)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file
--- a/init/README.rst
+++ b/init/README.rst
+Install the requirements
+------------------------
+1)  Install all the Debian packages listed in dependances.deb
+    (also: sudo apt-get install postgresql-contrib)
+2)  Create a virtual environment with pyvenv: apt-get install python-virtualenv
+3)  Type: source [your virtual environment directory]/bin/activate
+4)  Do your work!
+5)  Type: deactivate
+Configure stuff
+---------------
+1)  ln -s [the project folder] /srv/gargantext
+2)  ln -s [your folder for tree tagger] [the project folder]/parsing/Tagger/treetagger
+Warning: for ln, path has to be absolute!
+In PostgreSQL
+-------------
+1)  Ensure postgres is started: sudo /etc/init.d/postgresql start
+2)  sudo su postgres
+3)  psql
+4)  CREATE USER alexandre WITH PASSWORD 'C8kdcUrAQy66U';
+    (see gargantext_web/settings.py, DATABASES = { ... })
+5)  CREATE DATABASE gargandb WITH OWNER alexandre;
+6)  Ctrl + D
+7)  psql gargandb
+8)  CREATE EXTENSION hstore;
+9)  Ctrl + D
+Populate the database
+---------------------
+python manage.py syncdb
+Start the Python Notebook server
+--------------------------------
+1)  In Pyvenv: python manage.py shell_plus --notebook
+2)  Work from your browser!
+Start the Django server
+-----------------------
+python manage.py runserver
\ No newline at end of file
--- a/parsing/FileParsers/FileParser.py
+++ b/parsing/FileParsers/FileParser.py
@@ -114,8 +114,7 @@ class FileParser:
        # we are already in a transaction, so no use doing another one (or is there?)
        ngramcache = self._ngramcaches[language]
        for terms, occurences in ngrams.items():
-            ngram_text = ' '.join([term[0] for term in terms])
+            ngram = ngramcache[terms]
-            ngram = ngramcache[ngram_text]
            Node_Ngram(
                node       = childNode,
                ngram      = ngram,