Commit 6c4a607c authored by Mathieu Rodic's avatar Mathieu Rodic

Merge branch 'mat' of ssh://delanoe.org:1979/gargantext

parents 75a84f95 2ca5116a
{
"metadata": {
"name": "",
"signature": "sha256:7c80ed9f4b088e13444efb451a1ee46e5727247be14aaf30ddf0236a49ac461b"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": []
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:a5146fbde2b6bf2e3ed4e2bdddfb62662f99272f26e82bf86110680ff3595332"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from node.models import Node, NodeType, Language\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"node = Node.objects.get(name=\"PubMed corpus\")"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/var/www/gargantext/media/' + node.fichier.name)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(node)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "Cannot assign \"24\": \"Node.user\" must be a \"User\" instance.",
"output_type": "pyerr",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-4-8c1443001599>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mfileparser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnode\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m/srv/gargantext/parsing/FileParsers/PubmedFileParser.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(self, parentNode, tag)\u001b[0m\n\u001b[0;32m 45\u001b[0m \u001b[0mlanguage\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_languages_iso3\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mmetadata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"language_iso3\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[0mmetadata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 47\u001b[1;33m \u001b[0mguid\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m\"doi\"\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 48\u001b[0m )\n\u001b[0;32m 49\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mdocument\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/srv/gargantext/parsing/FileParsers/FileParser.py\u001b[0m in \u001b[0;36mcreate_document\u001b[1;34m(self, parentNode, title, contents, language, metadata, guid)\u001b[0m\n\u001b[0;32m 100\u001b[0m \u001b[0mmetadata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmetadata\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[1;31m#resource = resource,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 102\u001b[1;33m \u001b[0mparent\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparentNode\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 103\u001b[0m )\n\u001b[0;32m 104\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/models/base.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 403\u001b[0m \u001b[1;31m# \"user_id\") so that the object gets properly cached (and type\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 404\u001b[0m \u001b[1;31m# checked) by the RelatedObjectDescriptor.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 405\u001b[1;33m \u001b[0msetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfield\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrel_obj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 406\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 407\u001b[0m \u001b[0msetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfield\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mattname\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mval\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m/home/alexandre/projets/gargantext.py/env/lib/python3.4/site-packages/django/db/models/fields/related.py\u001b[0m in \u001b[0;36m__set__\u001b[1;34m(self, instance, value)\u001b[0m\n\u001b[0;32m 337\u001b[0m raise ValueError('Cannot assign \"%r\": \"%s.%s\" must be a \"%s\" instance.' %\n\u001b[0;32m 338\u001b[0m (value, instance._meta.object_name,\n\u001b[1;32m--> 339\u001b[1;33m self.field.name, self.field.rel.to._meta.object_name))\n\u001b[0m\u001b[0;32m 340\u001b[0m \u001b[1;32melif\u001b[0m \u001b[0mvalue\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 341\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0minstance\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_state\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdb\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mValueError\u001b[0m: Cannot assign \"24\": \"Node.user\" must be a \"User\" instance."
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"node.children.all()"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
{
"metadata": {
"name": "",
"signature": "sha256:e0c3b2efe7c205a29dc4e028b10ffb7b9d0569f35c4b426febdf523069abffdb"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"from pprint import pprint\n",
"from node.models import Node, NodeType, Language, Ngram\n",
"from django.contrib.auth.models import User\n",
"import parsing\n",
"from parsing.FileParsers import *"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Define user\n",
"try:\n",
" user = User.objects.get(username='Mat')\n",
"except:\n",
" user = User(username='Mat', password='0123', email='mathieu@rodic.fr')\n",
" user.save()\n",
"\n",
"# Define document types\n",
"nodetypes = {}\n",
"for name in ['Corpus', 'Document']:\n",
" try:\n",
" nodetypes[name] = NodeType.objects.get(name=name)\n",
" except:\n",
" nodetypes[name] = NodeType(name=name)\n",
" nodetypes[name].save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"Node.objects.all().delete()\n",
"corpus = Node(name='PubMed corpus', user=user, type=nodetypes['Corpus'])\n",
"corpus.save()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser = PubmedFileParser.PubmedFileParser(file='/home/mat/projects/gargantext/data_samples/pubmed.zip')"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"fileparser.parse(corpus)\n",
"print('Ok!')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"Warning: parsing empty text\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"Warning: parsing empty text\n",
"Warning: parsing empty text"
]
}
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for node_ngram in corpus.children.first().node_ngram_set.all():\n",
" print(node_ngram.ngram.terms)"
],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
\ No newline at end of file
This diff is collapsed.
......@@ -23,9 +23,9 @@ PROJECT_PATH = os.path.abspath(PROJECT_PATH)
SECRET_KEY = 'bt)3n9v&a02cu7^^=+u_t2tmn8ex5fvx8$x4r*j*pb1yawd+rz'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = False
DEBUG = True
TEMPLATE_DEBUG = False
TEMPLATE_DEBUG = True
TEMPLATE_DIRS = (
......
Install the requirements
------------------------
1) Install all the Debian packages listed in dependances.deb
(also: sudo apt-get install postgresql-contrib)
2) Create a virtual environment with pyvenv: apt-get install python-virtualenv
3) Type: source [your virtual environment directory]/bin/activate
4) Do your work!
5) Type: deactivate
Configure stuff
---------------
1) ln -s [the project folder] /srv/gargantext
2) ln -s [your folder for tree tagger] [the project folder]/parsing/Tagger/treetagger
Warning: for ln, path has to be absolute!
In PostgreSQL
-------------
1) Ensure postgres is started: sudo /etc/init.d/postgresql start
2) sudo su postgres
3) psql
4) CREATE USER alexandre WITH PASSWORD 'C8kdcUrAQy66U';
(see gargantext_web/settings.py, DATABASES = { ... })
5) CREATE DATABASE gargandb WITH OWNER alexandre;
6) Ctrl + D
7) psql gargandb
8) CREATE EXTENSION hstore;
9) Ctrl + D
Populate the database
---------------------
python manage.py syncdb
Start the Python Notebook server
--------------------------------
1) In Pyvenv: python manage.py shell_plus --notebook
2) Work from your browser!
Start the Django server
-----------------------
python manage.py runserver
\ No newline at end of file
......@@ -2,7 +2,6 @@
psql -d gargandb -f init.sql
sleep 2
./manage.py syncdb
......
import collections
from node.models import Node, NodeType, Language, Ngram, Node_Ngram
from parsing.NgramsExtractors import *
# This allows the fast retrieval of ngram ids
# from the cache instead of using the database for every call
class NgramCache:
"""
This allows the fast retrieval of ngram ids
from the cache instead of using the database for every call
"""
def __init__(self, language):
self._cache = dict()
......@@ -13,9 +16,9 @@ class NgramCache:
terms = terms.strip().lower()
if terms not in self._cache:
try:
ngram = NGram.get(terms=terms, language=self._language)
ngram = Ngram.get(terms=terms, language=self._language)
except:
ngram = NGram(terms=terms, n=len(terms), language=self._language)
ngram = Ngram(terms=terms, n=len(terms), language=self._language)
ngram.save()
self._cache[terms] = ngram
return self._cache[terms]
......@@ -43,12 +46,11 @@ class FileParser:
self._ngramcaches = NgramCaches()
# extractors
self._extractors = dict()
self._document_nodetype = NodeType.get(name='Document')
with Language.objects.all() as languages:
self._languages_iso2 = {language.iso2.lower(): language for language in Language}
self._languages_iso3 = {language.iso3.lower(): language for language in Language}
# ...and parse!
self.parse()
self._document_nodetype = NodeType.objects.get(name='Document')
languages = Language.objects.all()
self._languages_iso2 = {language.iso2.lower(): language for language in languages}
self._languages_iso3 = {language.iso3.lower(): language for language in languages}
#self.parse()
"""Extract the ngrams from a given text.
"""
......@@ -65,45 +67,54 @@ class FileParser:
extractor = self._extractors[language]
# Extract the ngrams
if extractor:
tokens = []
for ngram in extractor.extract_ngrams(text):
ngram_text = ' '.join([token for token, tag in ngram])
tokens.append(ngram_text)
return collections.Counter(
[token for token, tag in extractor.extract_ngrams(text)]
# [token for token, tag in extractor.extract_ngrams(text)]
tokens
)
else:
return dict()
#TODO
# * make it possible to tag and parse separately
# * only tags some data (only titles, titles & abstracts, some chapters...)
"""Add a document to the database.
"""
def create_document(self, parentNode, title, contents, language, metadata, guid=None):
# create or retrieve a resource for that document, based on its user id
if guid is None:
resource = Resource(guid=guid)
else:
try:
resource = Resource.get(guid=guid)
except:
resource = Resource(guid=guid)
# If the parent node already has a child with this resource, pass
# (is it a good thing?)
if parentNode.descendants().filter(resource=resource).exists():
return None
# if guid is None:
# resource = Resource(guid=guid)
# else:
# try:
# resource = Resource.get(guid=guid)
# except:
# resource = Resource(guid=guid)
# # If the parent node already has a child with this resource, pass
# # (is it a good thing?)
# if parentNode.descendants().filter(resource=resource).exists():
# return None
# create the document itself
childNode = Node(
user = parentNode.pk,
user = parentNode.user,
type = self._document_nodetype,
name = title,
language = language,
metadata = metadata,
resource = resource,
#resource = resource,
parent = parentNode
)
childNode.save()
# parse it!
ngrams = self.extract_ngrams(contents, language)
# we are already in a transaction, so no use doing another one (or is there?)
ngramcache = self._ngramcaches[language]
for terms, occurences in ngrams.items():
ngram_text = ' '.join([term[0] for term in terms])
ngram = ngramcache[ngram_text]
ngram = ngramcache[terms]
Node_Ngram(
node = childNode,
ngram = ngram,
......@@ -111,7 +122,7 @@ class FileParser:
).save()
# return the created document
return document
return childNode
"""Useful method to detect the document encoding.
Not sure it should be here actually.
......
from django.db import transaction
from lxml import etree
from parsing.FileParsers.FileParser import FileParser
from parsing.NgramsExtractors import *
import zipfile
import datetime
class PubmedFileParser(FileParser):
def parse(self, parentNode):
def parse(self, parentNode, tag=True):
# open the file as XML
xml_parser = etree.XMLParser(resolve_entities=False, recover=True)
xml = etree.parse(self._file, parser=xml_parser)
# parse all the articles, one by one
# all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle')
documents = []
with transaction.atomic():
for xml_article in xml_articles:
# extract data from the document
date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata = {
# other metadata should also be included:
# authors, submission date, etc.
"date_pub": datetime.date(year, month, day),
"journal": xml_article.find('MedlineCitation/Article/Journal/Title').text,
"title": xml_article.find('MedlineCitation/Article/ArticleTitle').text,
"language_iso3": xml_article.find('MedlineCitation/Article/Language').text,
"doi": xml_article.find('PubmedData/ArticleIdList/ArticleId[type=doi]').text
}
contents = xml_article.find('MedlineCitation/Article/Abstract/AbstractText').text
# create the document in the database
yield self.create_document(
parentNode = parentNode,
document = self.create_document(
parentNode = parentNode,
title = metadata["title"],
contents = contents,
language = self._languages_iso3[metadata["language"].lower()],
metadata = metadata,
guid = metadata["doi"],
)
if document:
documents.append(document)
with zipfile.ZipFile(self._file) as zipFile:
for filename in zipFile.namelist():
file = zipFile.open(filename, "r")
# print(file.read())
xml = etree.parse(file, parser=xml_parser)
# parse all the articles, one by one
# all database operations should be performed within one transaction
xml_articles = xml.findall('PubmedArticle')
for xml_article in xml_articles:
# extract data from the document
date_year = int(xml_article.find('MedlineCitation/DateCreated/Year').text)
date_month = int(xml_article.find('MedlineCitation/DateCreated/Month').text)
date_day = int(xml_article.find('MedlineCitation/DateCreated/Day').text)
metadata = {
"date_pub": '%s-%s-%s' % (date_year, date_month, date_day),
}
metadata_path = {
"journal" : 'MedlineCitation/Article/Journal/Title',
"title" : 'MedlineCitation/Article/ArticleTitle',
"language_iso3" : 'MedlineCitation/Article/Language',
"doi" : 'PubmedData/ArticleIdList/ArticleId[type=doi]',
"abstract" : 'MedlineCitation/Article/Abstract/AbstractText'
}
for key, path in metadata_path.items():
try:
node = xml_article.find(path)
metadata[key] = node.text
except:
metadata[key] = ""
contents = metadata["abstract"]
# create the document in the database
document = self.create_document(
parentNode = parentNode,
title = metadata["title"],
contents = contents,
language = self._languages_iso3[metadata["language_iso3"].lower()],
metadata = metadata,
#guid = metadata["doi"],
)
if document:
documents.append(document)
return documents
from NgramsExtractors.NgramsExtractor import NgramsExtractor
from Taggers import NltkTagger
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
from parsing.Taggers import NltkTagger
class EnglishNgramsExtractor(NgramsExtractor):
......
from NgramsExtractors.NgramsExtractor import NgramsExtractor
from Taggers import TreeTagger
from parsing.NgramsExtractors.NgramsExtractor import NgramsExtractor
from parsing.Taggers import TreeTagger
class FrenchNgramsExtractor(NgramsExtractor):
......
from Taggers import Tagger
from parsing.Taggers import Tagger
import nltk
......@@ -17,9 +17,8 @@ class NgramsExtractor:
def __del__(self):
self.stop()
def start(self):
self.tagger = Tagger
self.tagger = Tagger()
def stop(self):
pass
......@@ -40,7 +39,7 @@ class NgramsExtractor:
except:
print("Problem while parsing rule '%s'" % (self._rule, ))
pass
return iter(result)
return result
\ No newline at end of file
from NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
\ No newline at end of file
#from NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
#from NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from parsing.NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from parsing.NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
from Taggers.Tagger import Tagger
from parsing.Taggers.Tagger import Tagger
import nltk
......
from Taggers.Tagger import Tagger
from parsing.Taggers.Tagger import Tagger
import subprocess
import threading
......
from Taggers.NltkTagger import NltkTagger
from Taggers.TreeTagger import TreeTagger
from parsing.Taggers.NltkTagger import NltkTagger
from parsing.Taggers.TreeTagger import TreeTagger
......@@ -2,21 +2,21 @@ from NgramsExtractors import *
from Taggers import *
#texts = [
# "This is quite a simple test.",
# "Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe.",
# "James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour.",
#]
#tagger = NltkTagger()
#extractor = EnglishNgramsExtractor()
#
texts = [
"This is quite a simple test.",
"Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe.",
"James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour.",
"La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini.",
"Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie.",
"Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone.",
]
tagger = NltkTagger()
extractor = EnglishNgramsExtractor()
# texts = [
# "La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini.",
# "Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie.",
# "Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone.",
# ]
# tagger = TreeTagger()
# extractor = FrenchNgramsExtractor()
tagger = TreeTagger()
extractor = FrenchNgramsExtractor()
for text in texts:
......@@ -25,4 +25,4 @@ for text in texts:
ngrams = extractor.extract_ngrams(text)
for ngram in ngrams:
print("\t" + str(ngram))
print("\n")
\ No newline at end of file
print("\n")
......@@ -6,4 +6,4 @@ node = Node.objects.get(name="PubMed corpus")
parser = parsing.Parser()
parser.parse_node_fichier(node)
#parser.parse_node_fichier(node)
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment