Commit 0d98962c authored by Mathieu Rodic

The ngrams extractors are working, both in English and in French!

See mat-parsing/test.py for more info about how it works.
parent 4394900c
from NgramsExtractor import NgramsExtractor
from NgramsExtractors.NgramsExtractor import NgramsExtractor
from Taggers import NltkTagger
class EnglishNgramsExtractor(NgramsExtractor):
    """Ngrams extractor for English texts.

    Everything is inherited from NgramsExtractor; only the tagger differs.
    """

    def start(self):
        """Create the tagger used by this extractor (an NltkTagger)."""
        self.tagger = NltkTagger()
from NgramsExtractor import NgramsExtractor
from NgramsExtractors.NgramsExtractor import NgramsExtractor
from Taggers import TreeTagger
class FrenchNgramsExtractor(NgramsExtractor):
    """Ngrams extractor for French texts.

    Everything is inherited from NgramsExtractor; only the tagger differs.
    """

    def start(self):
        """Create the tagger used by this extractor (a TreeTagger)."""
        self.tagger = TreeTagger()
from Taggers import Tagger
import nltk
"""Base class for all ngrams extractors.
......@@ -7,17 +9,46 @@ class NgramsExtractor:
"""Class instanciation.
This method can be overridden.
"""
def __init__(self, rule="NP: {<JJ.*>*<NN.*|>+<JJ.*>*}"):
    """Store the chunking rule, then run the start() hook.

    rule -- grammar string later handed to nltk.RegexpParser by
            extract_ngrams(); the default defines a chunk labelled 'NP'.
    """
    # Assign the rule first so that an overriding start() can already read it.
    self._rule = rule
    self.start()
def __del__(self):
    """Finalizer: run the stop() hook when the instance is destroyed."""
    self.stop()
def start(self):
    """Hook run from __init__: create the tagger used by extract_ngrams().

    Subclasses override this to install a language-specific tagger.
    """
    # Instantiate the tagger: extract_ngrams() calls self.tagger.tag_text(...)
    # as a bound method, which fails when the bare class is stored here.
    self.tagger = Tagger()
def stop(self):
    """Hook run from __del__; the base class has nothing to release."""
    return None
def tag_ngrams(self, contents):
    """Placeholder: ignores contents and returns a new empty list."""
    tagged = []
    return tagged
"""Extracts a list of ngrams.
Returns a list of the ngrams found in the given text.
"""
def extract_ngrams(self, contents):
    """Return an iterator over the 'NP' chunks found in contents.

    contents is tagged with self.tagger.tag_text(), the tagged tokens are
    parsed by an nltk.RegexpParser built from self._rule, and the leaves of
    every subtree labelled 'NP' are collected, one list per chunk.

    Parsing is best-effort: if it raises, the error is printed and whatever
    was collected up to that point is still returned.
    """
    # NOTE(review): presumably tag_text() returns (token, tag) pairs, which
    # is what RegexpParser.parse() expects -- confirm against the taggers.
    tagged_tokens = self.tagger.tag_text(contents)
    chunker = nltk.RegexpParser(self._rule)
    noun_phrases = []
    try:
        parsed = chunker.parse(tagged_tokens)
        # Only chunks labelled 'NP' are kept, whatever labels the rule defines.
        for subtree in parsed.subtrees():
            if subtree.label() == 'NP':
                noun_phrases.append(subtree.leaves())
    except Exception as error:
        # Deliberate best-effort boundary: report the failure, keep going.
        print(error)
    return iter(noun_phrases)
\ No newline at end of file
from FrenchNgramsExtractor import FrenchNgramsExtractor
from EnglishNgramsExtractor import EnglishNgramsExtractor
\ No newline at end of file
from NgramsExtractors.FrenchNgramsExtractor import FrenchNgramsExtractor
from NgramsExtractors.EnglishNgramsExtractor import EnglishNgramsExtractor
\ No newline at end of file
from Tagger import Tagger
from Taggers.Tagger import Tagger
import nltk
......
from Tagger import Tagger
from Taggers.Tagger import Tagger
import subprocess
import threading
......@@ -46,7 +46,7 @@ Shall be used for french texts.
"""
class TreeTagger(Tagger):
def start(self, treeTaggerPath = "../../../nlp/pythonwrapperP3/treetagger"):
def start(self, treeTaggerPath = "./Taggers/treetagger"):
binaryFile = "%s/bin/tree-tagger" % treeTaggerPath
tagcmdlist = [
binaryFile,
......@@ -70,9 +70,11 @@ class TreeTagger(Tagger):
def stop(self):
    """Terminate the 'treetagger' process; never raises if it is already gone."""
    # NOTE(review): self._popen is presumably a subprocess.Popen created by
    # start() -- confirm, since the OSError case below relies on that.
    try:
        self._popen.kill()
        self._popen.terminate()
    except (AttributeError, OSError):
        # AttributeError: start() never got as far as creating self._popen.
        # OSError: the process had already exited.
        pass
def tagging_start(self):
self.buffer = []
......
from NltkTagger import NltkTagger
from TreeTagger import TreeTagger
\ No newline at end of file
from Taggers.NltkTagger import NltkTagger
from Taggers.TreeTagger import TreeTagger
\ No newline at end of file
# English setup: NltkTagger with two English sample paragraphs.
# A commented-out copy comes first, followed by the live statements.
# from NltkTagger import NltkTagger
# tagger = NltkTagger()
# text0 = "Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe."
# text1 = "James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour."
from NltkTagger import NltkTagger
tagger = NltkTagger()
text0 = "Forman Brown (1901–1996) was one of the world's leaders in puppet theatre in his day, as well as an important early gay novelist. He was a member of the Yale Puppeteers and the driving force behind Turnabout Theatre. He was born in Otsego, Michigan, in 1901 and died in 1996, two days after his 95th birthday. Brown briefly taught at North Carolina State College, followed by an extensive tour of Europe."
text1 = "James Patrick (born c. 1940) is the pseudonym of a Scottish sociologist, which he used to publish a book A Glasgow Gang Observed. It attracted some attention in Scotland when it was published in 1973. It was based on research he had done in 1966, when he was aged 26. At that time he was working as a teacher in an Approved School, a Scottish reformatory. One gang member in the school, \"Tim Malloy\" (born 1950, also a pseudonym and a generic term for a Glasgow Catholic), agreed to infiltrate him into his gang in Maryhill in Glasgow. Patrick spent four months as a gang member, observing their behaviour."
# French setup: TreeTagger with two French sample paragraphs.
# The live statements below rebind tagger, text0 and text1 from the English
# setup above, so they are the values in effect after this point.
# from TreeTagger import TreeTagger
# tagger = TreeTagger()
# text0 = "La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini."
# text1 = "Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie."
from TreeTagger import TreeTagger
tagger = TreeTagger()
text0 = "La saison 1921-1922 du Foot-Ball Club Juventus est la vingtième de l'histoire du club, créé vingt-cinq ans plus tôt en 1897. La société turinoise qui fête cette année son 25e anniversaire prend part à l'édition du championnat dissident d'Italie de la CCI (appelé alors la Première division), la dernière édition d'une compétition annuelle de football avant l'ère fasciste de Mussolini."
text1 = "Le terme oblong désigne une forme qui est plus longue que large et dont les angles sont arrondis. En langage bibliographique, oblong signifie un format dont la largeur excède la hauteur. Ce qui correspond au format paysage en termes informatiques et \"à l'italienne\", pour l'imprimerie."
text2 = "Les sanglots longs des violons de l'automne bercent mon coeur d'une langueur monotone."
# Print the tagger output for each sample, with a blank line before each
# result and one more after the last.
for sample in (text0, text1, text2):
    print()
    print(tagger.tag_text(sample))
print()
\ No newline at end of file
from Taggers import *
from NgramsExtractors import *
from FileParsers import *
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment