Commit 298679e1 authored by delanoe

Merge remote-tracking branch 'origin/romain-testing-taggers-parsers' into testing-merge

parents 427c107d 5daefcf6
"""
Creates MD5 hashes (used for unique filepaths)
NB this module could go inside util.files
"""
import hashlib
import binascii
def digest(value, algorithm='md5'):
"""
Ex: b'm\x00\x07\xe5/z\xfb}Z\x06P\xb0\xff\xb8\xa4\xd1'
(16 bytes, each ranging from 0x00 to 0xff)
"""
m = hashlib.new(algorithm)
m.update(value)
return m.digest()
def str_digest(value, algorithm='md5'):
"""
Ex: 6d0007e52f7afb7d5a0650b0ffb8a4d1
(32 hex chars)
"""
return binascii.hexlify(digest(value, algorithm)).decode()
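A minimal usage sketch for the two helpers above (not part of the commit; the sample value is purely illustrative, and note that hashlib only accepts bytes, so str values must be encoded first):

```python
# usage sketch for digest() / str_digest() defined above
raw = digest(b"abc")                 # 16 raw bytes
hexed = str_digest(b"abc")           # '900150983cd24fb0d6963f7d28e17f72'
assert len(raw) == 16 and len(hexed) == 32

# typical case: derive a unique filepath fragment from some content
content = "/projects/42/my_corpus.ris".encode("utf8")   # illustrative value
unique_part = str_digest(content)
```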
......@@ -4,8 +4,8 @@
"""
__author__ = "Gargantext Team"
__copyright__ = "Copyright 2014-15 ISCPIF-CNRS"
__version__ = "0.1"
__copyright__ = "Copyright 2014-16 ISCPIF-CNRS"
__version__ = "0.2"
__email__ = "romain.loth@iscpif.fr"
__status__ = "Test"
......@@ -27,6 +27,7 @@ import sys
#from admin.env import *
#from parsing.FileParsers.FileParser import FileParser
from ._Parser import Parser
from traceback import format_tb
class EuropresseParser(Parser):
......@@ -112,15 +113,15 @@ class EuropresseParser(Parser):
# parse all the articles, one by one
try:
for html_article in html_articles:
print("==============================new article")
for html_article in html_articles:
try:
# if there is no header at all we must skip
all_header = html_article.xpath(entire_header_xpath)
if len(all_header) == 0:
print("WARNING: europress (skip) article without header")
all_header_text = " ".join(scrap_text(all_header))
if len(all_header) == 0 or len(all_header_text) == 0:
hyperdata['error']="Europresse: html doc with no header"
yield(hyperdata)
print("WARNING: europresse (skip) article without header")
continue
hyperdata = {}
......@@ -134,7 +135,9 @@ class EuropresseParser(Parser):
hyperdata['title'] = title[0]
except:
# there will be a display problem if there is no title!
print("WARNING: europress (skip) article without title")
print("WARNING: europresse (skip) article without title")
hyperdata['error']="Europresse: doc with no title"
yield(hyperdata)
continue
......@@ -189,12 +192,19 @@ class EuropresseParser(Parser):
else:
# occasionally the DocHeader is absent
# (we fall back to the entire header)
search_text = " ".join(scrap_text(all_header[0]))
search_text = all_header_text
# print("---using all header: '%s'" % search_text)
# if no text zone was found at all
if not search_text:
the_err = "europresse (skip) doc without detailed header"
print("WARNING:" + the_err)
hyperdata['error']= the_err
yield(hyperdata)
continue
# continue date/language detection with the obtained zone
# continue date/language detection with the obtained zone...
# 1) A REGEXP identifies the language AND captures the date
test_date_fr = re.search(format_date_fr,search_text)
......@@ -223,7 +233,7 @@ class EuropresseParser(Parser):
# match str
date_str = test_date_en.group()
else:
print("WARNING europress: echec diagnostic date/langue header sur '%s'" % header)
print("WARNING europresse: echec diagnostic date/langue header sur '%s'" % header)
# default lg value, used locally, not saved
doc_language = 'en'
# default date value, will be saved
......@@ -260,8 +270,12 @@ class EuropresseParser(Parser):
# most probably news_topic before beginning of date
hyperdata['rubrique'] = header_elts[0]
print(hyperdata)
# print(hyperdata)
yield hyperdata
except Exception as err:
print('Europresse parser: Something bad happened:' + str(err))
except Exception as err:
print('WARNING: europresse (skip) unknown error:"' + str(err) + '"'
+ "\n>>>" + (">>>".join(format_tb(err.__traceback__))))
hyperdata['error']= err
yield(hyperdata)
continue
......@@ -27,7 +27,6 @@ class ISTexParser(Parser):
}
suma = 0
print(len(json_docs))
for json_doc in json_docs:
hyperdata = {}
......
from ._Parser import Parser
from gargantext.util.languages import languages
from re import match
class RISParser(Parser):
......@@ -11,11 +12,15 @@ class RISParser(Parser):
"AU": {"type": "hyperdata", "key": "authors", "separator": "\n"},
"T2": {"type": "hyperdata", "key": "journal"},
"UR": {"type": "hyperdata", "key": "doi"},
# RIS format specifications: PY is not only year but YYYY/MM/DD with MM and DD optional
# cf. https://en.wikipedia.org/wiki/RIS_(file_format)
"PY": {"type": "hyperdata", "key": "publication_year"},
"PD": {"type": "hyperdata", "key": "publication_month"},
"N1": {"type": "hyperdata", "key": "references", "separator": ", "},
"LA": {"type": "hyperdata", "key": "language_iso2"},
"A": {"type": "hyperdata", "key": "abstract", "separator": " "},
"AB": {"type": "hyperdata", "key": "abstract", "separator": " "},
"WC": {"type": "hyperdata", "key": "fields"},
}
......@@ -44,7 +49,11 @@ class RISParser(Parser):
# 1 - we record the previous value array...
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else ""
hyperdata[parameter["key"]] = separator.join(last_values)
final_value = separator.join(last_values)
if last_key != 'PY':
hyperdata[parameter["key"]] = final_value
else:
hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
#... or even finish the record (rare here, most often after empty line)
elif parameter["type"] == "delimiter":
......@@ -83,7 +92,11 @@ class RISParser(Parser):
parameter = self._parameters[last_key]
if parameter["type"] == "hyperdata":
separator = parameter["separator"] if "separator" in parameter else ""
hyperdata[parameter["key"]] = separator.join(last_values)
final_value = separator.join(last_values)
if last_key != 'PY':
hyperdata[parameter["key"]] = final_value
else:
hyperdata = PY_values_decompose_and_save(final_value, hyperdata)
# if a hyperdata object is left in memory, yield it as well
if hyperdata:
......@@ -92,3 +105,51 @@ class RISParser(Parser):
if 'language_iso2' not in hyperdata.keys():
hyperdata['language_iso2'] = 'en'
yield hyperdata
# helper function for PY dates
def PY_values_decompose_and_save(ris_date_str, hyperdata):
"""
PY is associated to our publication_year, but the exact format is:
"YYYY/MM/DD/" (with MM and DD optional)
example contents:
1948/07/01
1948/07/01/
1948/07//
1948//
1948
=> This function does the necessary additional date subparsing
and saves the results in the 3 hyperdata slots: year, month, day
"""
possible_fields = ['publication_year',
'publication_month',
'publication_day',
None]
current_field_i = 0
buffr = ""
for char in ris_date_str:
if char != '/':
# continue reading
buffr += char
else:
# on '/' => we save and shift to next field
current_field = possible_fields[current_field_i]
if len(buffr):
hyperdata[current_field] = buffr
# prepare for next time
current_field_i += 1
buffr = ""
# save at the end too
current_field = possible_fields[current_field_i]
if len(buffr):
hyperdata[current_field] = buffr
# return updated meta
return hyperdata
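For reference, running the helper above on the example contents from its docstring gives (illustrative, not part of the commit):

```python
# quick illustration of PY_values_decompose_and_save on the docstring examples
for py in ("1948/07/01", "1948/07//", "1948"):
    print(PY_values_decompose_and_save(py, {}))
# -> {'publication_year': '1948', 'publication_month': '07', 'publication_day': '01'}
# -> {'publication_year': '1948', 'publication_month': '07'}
# -> {'publication_year': '1948'}
```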
......@@ -57,9 +57,15 @@ class Parser:
elif hyperdata.get('publication_year', None) is not None:
prefixes = [key[:-5] for key in hyperdata.keys() if key[-5:] == "_year"]
# eg prefixes : ['publication']
for prefix in prefixes:
date_string = hyperdata[prefix + "_year"]
# FIXME: apart from the year, is it necessary to test that the key exists
# when we have a default value in .get(key, "01") ??
key = prefix + "_month"
if key in hyperdata:
date_string += " " + hyperdata.get(key, "01")
......@@ -79,19 +85,19 @@ class Parser:
hyperdata[prefix + "_date"] = dateutil.parser.parse(date_string).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error:
try:
print(error, date_string)
print("_Parser: error in full date parse", error, date_string)
# Date format: 1994 NOV-DEC
hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:8]).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error:
try:
print("error line 93", error)
print("_Parser: error in short date parse", error)
# FIXME Date format: 1994 SPR
# By default, we take the year only
hyperdata[prefix + "_date"] = date_parser.parse(str(date_string)[:4]).strftime("%Y-%m-%d %H:%M:%S")
except Exception as error:
print("error line 99", error)
print("_Parser:", error)
else:
print("WARNING: Date unknown at _Parser level, using now()")
hyperdata['publication_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
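The nested try/except blocks above implement a coarse-to-fine fallback: the full date string first, then its first 8 characters (e.g. "1994 NOV"), then the year alone. A standalone sketch of that strategy with dateutil (an illustration of the idea, not the exact code path above):

```python
import dateutil.parser

def fallback_parse(date_string):
    """Try the full string, then 'YYYY MON', then the year alone."""
    for candidate in (str(date_string), str(date_string)[:8], str(date_string)[:4]):
        try:
            return dateutil.parser.parse(candidate).strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, OverflowError):
            continue
    return None

# e.g. "1994 NOV-DEC" should resolve via "1994 NOV", "1994 SPR" via "1994"
print(fallback_parse("1994 NOV-DEC"))
print(fallback_parse("1994 SPR"))
```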
......
from .MeltTagger import MeltTagger, _tag_replacements
class EnglishMeltTagger(MeltTagger):
def __init__(self, *args, **kwargs):
MeltTagger.__init__(self, *args, **kwargs)
self.language = 'en'
self._tag_replacements = _tag_replacements['en']
from .MeltTagger import MeltTagger, _tag_replacements
class FrenchMeltTagger(MeltTagger):
def __init__(self, *args, **kwargs):
MeltTagger.__init__(self, *args, **kwargs)
self.language = 'fr'
self._tag_replacements = _tag_replacements['fr']
......@@ -55,7 +55,7 @@ class MeltTagger(Tagger):
def __init__(self, *args, **kwargs):
self.language = kwargs.pop('language', 'fr')
self._tag_replacements = _tag_replacements[self.language]
super(self.__class__, self).__init__(*args, **kwargs)
Tagger.__init__(self, *args, **kwargs)
def start(self, melt_data_path='lib/melttagger'):
language = self.language
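Replacing super(self.__class__, ...) with an explicit Tagger.__init__ call sidesteps a classic Python pitfall: when MeltTagger is subclassed (as in the EnglishMeltTagger/FrenchMeltTagger modules above), self.__class__ inside MeltTagger.__init__ is the subclass, so super(self.__class__, self).__init__() resolves back to MeltTagger.__init__ itself and recurses. A minimal standalone reproduction of the pitfall (illustrative, not project code):

```python
class Base:
    def __init__(self):
        print("Base init")

class Middle(Base):
    def __init__(self):
        # BUG: with self.__class__, a further subclass makes this call itself again
        super(self.__class__, self).__init__()

class Child(Middle):
    pass

# Middle() works, but Child() raises RecursionError:
# super(Child, self) resolves to Middle, whose __init__ runs again, and so on.
```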
......@@ -102,7 +102,8 @@ class MeltTagger(Tagger):
if len(token.string):
yield (token.string, token.label, )
def extract(self, text, lemmatize=False):
def tag_text(self, text, lemmatize=False):
# print("IN MeltTagger.tag_text()")
tagged_tokens = self._tag(text)
if not lemmatize:
# without lemmatization
......@@ -123,10 +124,14 @@ class MeltTagger(Tagger):
yield (values[0], self._tag_replacements[values[1]], values[2].replace('*', ''))
def EnglishMeltTagger(*args, **kwargs):
kwargs['language'] = 'en'
return MeltTagger(*args, **kwargs)
# 2016-09-02: these two constructors go outside
# to respect the new tagger import
# mechanism (1 tagger <=> 1 module)
def FrenchMeltTagger(*args, **kwargs):
kwargs['language'] = 'fr'
return MeltTagger(*args, **kwargs)
# def EnglishMeltTagger(*args, **kwargs):
# kwargs['language'] = 'en'
# return MeltTagger(*args, **kwargs)
#
# def FrenchMeltTagger(*args, **kwargs):
# kwargs['language'] = 'fr'
# return MeltTagger(*args, **kwargs)
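The comment above refers to the convention that the tagger loader expects one tagger per module; a hedged sketch of how such a name-to-module lookup could work (the function name and module path here are assumptions, not the project's actual loader):

```python
import importlib

def load_tagger(name):
    """Illustrative only: import taggers.<Name> and return the constructor
    of the same name, e.g. 'FrenchMeltTagger' -> FrenchMeltTagger."""
    module = importlib.import_module('gargantext.util.taggers.%s' % name)
    return getattr(module, name)

# tagger = load_tagger('FrenchMeltTagger')()   # would build a MeltTagger(language='fr')
```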
......@@ -19,8 +19,7 @@ class Tagger:
| [][.,;"'?!():-_`] # these are separate tokens
''', re.UNICODE | re.MULTILINE | re.DOTALL)
self.buffer = []
#self.start()
self.start()
def clean_text(self, text):
......@@ -33,6 +32,7 @@ class Tagger:
self.text = self.clean_text(text)
grammar = nltk.RegexpParser(label + ': ' + rule)
tagged_tokens = list(self.tag_text(self.text))
# print("the tagged_tokens", tagged_tokens)
if len(tagged_tokens):
grammar_parsed = grammar.parse(tagged_tokens)
for subtree in grammar_parsed.subtrees():
......@@ -46,12 +46,11 @@ class Tagger:
self.stop()
def start(self):
"""Initializes the tagger.
"""Initializes the tagger (once per corpus).
This method is called by the constructor, and can be overridden by
inherited classes.
"""
print("START")
self.extract(self.text)
# print("START")
def stop(self):
"""Ends the tagger.
......@@ -81,7 +80,7 @@ class Tagger:
return []
# Not used right now
# Main function for extract()
def tag_text(self, text):
"""Send a text to be tagged.
"""
......
#!/bin/bash
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/srv/gargantext_lib/taggers/nlpserver/TurboParser/deps/local/lib:"
if [[ "$VIRTUAL_ENV" != "" ]]
then
source /srv/env_3-5/bin/activate
fi
export LD_LIBRARY_PATH=":/srv/gargantext_lib/taggers/nlpserver/TurboParser/deps/local/lib:"
source /srv/env_3-5/bin/activate
python server.py
python3 /srv/gargantext/gargantext/util/taggers/lib/nlpserver/server.py
......@@ -11,7 +11,8 @@ def _integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor):
£TODO: load the whole word dictionary in RAM and check existence before inserting into the db => sequential insert => probably faster!
"""
print('INTEGRATE', len(ngrams_data), len(nodes_ngrams_count))
# print('INTEGRATE', len(ngrams_data), len(nodes_ngrams_count))
print('INTEGRATE')
# integrate ngrams (aka new words)
ngrams_ids = bulk_insert_ifnotexists(
model = Ngram, # todo type should :str ~~> :str|:re) !!!
......@@ -118,7 +119,7 @@ def extract_ngrams(corpus, keys=DEFAULT_INDEX_FIELDS, do_subngrams = DEFAULT_IND
# integrate ngrams and nodes-ngrams
if len(nodes_ngrams_count) >= BATCH_NGRAMSEXTRACTION_SIZE:
print(len(nodes_ngrams_count),">=", BATCH_NGRAMSEXTRACTION_SIZE)
# print(len(nodes_ngrams_count),">=", BATCH_NGRAMSEXTRACTION_SIZE)
_integrate_associations(nodes_ngrams_count, ngrams_data, db, cursor)
nodes_ngrams_count.clear()
ngrams_data.clear()
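The £TODO in the docstring above suggests keeping the known words in RAM and checking existence before hitting the db; a rough sketch of that idea (hypothetical, not what the commit does: the imports follow the project layout used elsewhere in this diff, and the assumption that each ngrams_data row starts with the terms string may not hold):

```python
# hypothetical sketch of the £TODO: cache known ngram terms in RAM
from gargantext.models import Ngram
from gargantext.util.db import session

known_terms = {terms for (terms,) in session.query(Ngram.terms)}

def only_new(ngrams_data):
    """Filter out rows whose terms are already known, updating the cache."""
    new_rows = [row for row in ngrams_data if row[0] not in known_terms]
    known_terms.update(row[0] for row in new_rows)
    return new_rows
```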
......
......@@ -68,7 +68,8 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
lang_result['skipped'].append(hyperdata["language_name"])
else:
print("[WARNING] no language_* found in document [parsing.py]")
# print("WARNING no language_* found in document [parsing.py] => "
# + ("(detecting)" if DETECT_LANG else "(using default)"))
if DETECT_LANG:
# no language has been indexed
......@@ -93,7 +94,7 @@ def add_lang(hyperdata, observed_languages, skipped_languages):
def parse(corpus):
try:
print("PARSING")
print("DETECT_LANG?", DETECT_LANG)
# print("DETECT_LANG?", DETECT_LANG)
corpus.status('Docs', progress=0)
#1 corpus => 1 or multi resources.path (for crawlers)
resources = corpus.resources()
......@@ -107,7 +108,9 @@ def parse(corpus):
#corpus.status(error)
raise ValueError("Resource '%s' has no Parser" %resource["name"])
parserbot = load_parser(source)
print(parserbot)
# print(parserbot)
#observed languages in default languages
observed_languages = []
#skipped_languages
......@@ -218,10 +221,10 @@ def parse(corpus):
# the nice iso2 codes
observed_langs = dict(Counter(observed_languages))
print("#LANGAGES OK")
print(observed_langs)
print("#LANGUAGES UNKNOWN")
print(skipped_langs)
# print("#LANGAGES OK")
# print(observed_langs)
# print("#LANGUAGES UNKNOWN")
# print(skipped_langs)
top_langs = sorted(observed_langs.items(), key = lambda x: x[1], reverse=True)
if len(top_langs) > 0:
......
......@@ -32,10 +32,15 @@ Tests
5. ** tests ??? **
6. ** tests ??? **
7. **tests_070_routes**
Checks the response types from the app url routes:
- "/"
- "/api/nodes"
- "/api/nodes/<ID>"
Checks the response types from the app url routes:
- "/"
- "/api/nodes"
- "/api/nodes/<ID>"
8. ** tests users ??? **
9. **tests_090_toolchain**
Checks each data source parserbot (CSV, Pubmed, Zotero, Istex, etc.)
- correct parsing for a small sample
GargTestRunner
......@@ -109,4 +114,3 @@ class MyTestRecipes(TestCase):
self.assertEqual(the_response.status_code, 200)
```
*If you enjoy the adventures of Peter Corser, read the previous album ["Doors"](https://gogs.iscpif.fr/leclaire/doors)* (Script M. Leclaire, Art R. Loth) (available in all good bookshops)
......@@ -32,7 +32,7 @@ class ToolChainRecipes(TestCase):
def setUp(self):
#self.session = GargTestRunner.testdb_session
self.session = session
self.log= logging.getLogger( "SomeTest.testSomething" )
self.log= logging.getLogger( "unitests.test_090_toolchain" )
self.client = Client()
self.user = User()
self.project = self._create_project()
......@@ -40,34 +40,27 @@ class ToolChainRecipes(TestCase):
self.source_list.insert(0, (0,"Select a database below"))
self.sample_files = self._collect_samples_files()
def tearDown(self):
#del self.session
del self.client
#del self.factory
del self.source_list
del self.sample_files
del self.project
def _create_project(self):
self.project = Node(
project = Node(
user_id = self.user.id,
typename = 'PROJECT',
name = "test1000",
)
self.session.add(self.project)
self.session.add(project)
self.session.commit()
return self.project
return project
def __count_node_children__(self, CurrNode, typename=None):
'''find ALL the children of a given Node [optionnal filter TYPENAME] '''
'''count ALL the children of a given Node [optional filter TYPENAME] '''
if typename is None:
self.children = CurrNode.children('', order=True).count()
children = CurrNode.children('').count()
else:
self.children = CurrNode.children(typename, order=True).count()
return self.children
children = CurrNode.children(typename).count()
return children
def __find_node_parent__(self, CurrNode):
'''find the parent Node given a CurrNode '''
self.parent = self.session.query(Node).filter(Node.id == Node.parent_id, Node.name == name).first()
self.parent = self.session.query(Node).filter(Node.id == CurrNode.parent_id).first()
def _collect_samples_files(self):
from collections import defaultdict
......@@ -79,22 +72,24 @@ class ToolChainRecipes(TestCase):
for format_source in os.listdir(DATA_SAMPLE_DIR):
#self.log.debug(format_source)
full_path = join(DATA_SAMPLE_DIR, format_source)
if not os.path.isfile(full_path):
if not isfile(full_path):
if format_source in sources:
self.sample_files[format_source] = [join(full_path, samplef) for samplef in os.listdir(full_path)]
return self.sample_files
def _create_corpus(self,name, source_type, sample_file):
self.corpus = self.project.add_child(
corpus = self.project.add_child(
name = name,
typename = 'CORPUS',
)
self.corpus.add_resource(
corpus.add_resource(
type = int(source_type),
path = sample_file,
)
self.session.add(self.corpus)
self.session.add(corpus)
self.session.commit()
return self.corpus
return corpus
def _get_corpus(self, name):
corpus = self.session.query(Node).filter(Node.typename == "CORPUS", Node.name == name).first()
return corpus
......@@ -104,7 +99,7 @@ class ToolChainRecipes(TestCase):
Each of the resource input tests can follow this common recipe base
@param source_type: int (cf. constants.py RESOURCETYPES)
@param expected_results: []int (number of docs for each sample corpora of this source)
@param expected_results: int[] (number of docs for each sample corpora of this source)
"""
source = get_resource(source_type)
source_name = source["name"].split("[")[0].lower().strip().replace(" ", "_")
......@@ -168,8 +163,3 @@ class ToolChainRecipes(TestCase):
def tests_010(self):
self._run_recipe(10, DATA_SAMPLE_NDOCS[10])
if __name__ == "__main__":
logging.basicConfig( stream=sys.stderr )
logging.getLogger( "unitests.test_090_toolchain" ).setLevel( logging.DEBUG )
unittest.main()
"""
API UNIT TESTS
================
"""
from django.test import TestCase, Client
from gargantext.models import Node
from gargantext.util.db import session
from rest_framework.test import APIClient
from rest_framework.test import APIRequestFactory
# Using the standard RequestFactory API to create a form POST request
#factory = APIRequestFactory()
class APIRecipe(TestCase):
def setUp(self):
"""
Will be run before each test
"""
self.client = Client()
# login with our fake user
response = self.client.post(
'/auth/login/',
{'username': 'pcorser', 'password': 'peter'}
)
self.create_project()
self.create_corpus()
self.factory = APIRequestFactory()
def create_project(self):
new_project = Node(
typename = 'PROJECT',
name = "My project",
)
session.add(new_project)
session.commit()
self.project = new_project
def create_corpus(self):
#create a default corpus
self.corpus = self.project.add_child(
name = "My Corpus",
typename = 'CORPUS',
)
session.add(self.corpus)
session.commit()
def test_001_post_project(self):
'''POST /projects'''
request = self.factory.post('/api/projects/', {'name': 'PROJECT TEST'}, format='json')
def test_002_get_projects(self):
'''GET /projects'''
request = self.factory.get('/api/projects/', format='json')
def test_003_put_projects(self):
'''PUT /projects'''
request = self.factory.put('/api/projects/', {"name": "My TEST PROJECT"}, format='json')
def test_004_delete_projects(self):
'''DELETE /projects'''
request = self.factory.delete('/api/projects/', format='json')
def test_005_delete_project(self):
'''DELETE /project'''
request = self.factory.delete('/api/project/%s' %self.project.id, format='json')
def test_006_get_project(self):
'''GET /PROJECT'''
request = self.factory.get('/api/project/%s' %self.project.id, format='json')
def test_007_put_project(self):
''' PUT /PROJECT '''
request = self.factory.put('/api/project/%s' %self.project.id, {"name": "My New Project"}, format='json')
# def test_008_post_corpus(self):
# '''POST /project'''
# request = self.factory.post('/project/', {'name': 'PROJECT TEST'})