- Fixed a bug in FileParser: n-grams no longer have blank spaces inserted between their letters

- Added some tests as IPython notebooks

- Fixed a bug in FileParser: n-grams no longer have blank spaces inserted between their letters
- Added some tests as IPython notebooks
2ca5116a · Mathieu Rodic · f765c62f · 2ca5116a · 2ca5116a · 2ca5116a
Commit 2ca5116a authored Oct 22, 2014 by Mathieu Rodic
3 changed files
--- a/.ipynb_checkpoints/Languages integration (only run once!)-checkpoint.ipynb
+++ b/.ipynb_checkpoints/Languages integration (only run once!)-checkpoint.ipynb
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:7c80ed9f4b088e13444efb451a1ee46e5727247be14aaf30ddf0236a49ac461b"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": []
+}
\ No newline at end of file
--- a/.ipynb_checkpoints/test_parser_ngramextractor (Mat)-checkpoint.ipynb
+++ b/.ipynb_checkpoints/test_parser_ngramextractor (Mat)-checkpoint.ipynb
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "from pprint import pprint\n",
+      "from node.models import Node, NodeType, Language, Ngram\n",
+      "from django.contrib.auth.models import User\n",
+      "import parsing\n",
+      "from parsing.FileParsers import *"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 1
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Define user\n",
+      "try:\n",
+      "    user = User.objects.get(username='Mat')\n",
+      "except:\n",
+      "    user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
+      "    user.save()\n",
+      "\n",
+      "# Define document types\n",
+      "nodetypes = {}\n",
+      "for name in ['Corpus', 'Document']:\n",
+      "    try:\n",
+      "        nodetypes[name] = NodeType.objects.get(name=name)\n",
+      "    except:\n",
+      "        nodetypes[name] = NodeType(name=name)\n",
+      "        nodetypes[name].save()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 2
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "Node.objects.all().delete()\n",
+      "corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
+      "corpus.save()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [],
+     "prompt_number": 3
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "fileparser.parse(corpus)\n",
+      "print('Ok!')"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": [
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "Warning: parsing empty text\n",
+        "Warning: parsing empty text\n",
+        "Warning: parsing empty text"
+       ]
+      },
+      {
+       "output_type": "stream",
+       "stream": "stdout",
+       "text": [
+        "\n",
+        "Warning: parsing empty text\n",
+        "Warning: parsing empty text"
+       ]
+      }
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "for node_ngram in corpus.children.first().node_ngram_set.all():\n",
+      "    print(node_ngram.ngram.terms)"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file
--- a/parsing/FileParsers/FileParser.py
+++ b/parsing/FileParsers/FileParser.py
@@ -114,8 +114,7 @@ class FileParser:
        # we are already in a transaction, so no use doing another one (or is there?)
        ngramcache = self._ngramcaches[language]
        for terms, occurences in ngrams.items():
-            ngram_text = ' '.join([term[0] for term in terms])
-            ngram = ngramcache[ngram_text]
+            ngram = ngramcache[terms]
            Node_Ngram(
                node       = childNode,
                ngram      = ngram,