Commit 2754c5ea authored by sim's avatar sim

Add language detection and naming utilities

parent 4695d417
......@@ -10,22 +10,25 @@ pylint-django = "*"
colorama = "*"
[packages]
Django = "*"
django = "*"
dateutils = "*"
celery = "*"
SQLAlchemy = "*"
sqlalchemy = "*"
"psycopg2-binary" = "*"
SQLAlchemy-Utils = "*"
sqlalchemy-utils = "*"
djangorestframework = "*"
djangorestframework-jwt = "*"
django_celery_beat = "*"
django-celery-beat = "*"
python-decouple = "*"
alembic = "*"
Scrapy = "*"
scrapy = "*"
jmespath = "*"
RISparser = "*"
risparser = "*"
scrapyd = "*"
scrapyd-client = "*"
pycountry = "*"
langid = "*"
langdetect = "*"
[requires]
python_version = "3.5"
{
"_meta": {
"hash": {
"sha256": "d1ef83cc88284a8955903830c84c3fa2883ec47ca779e36a9b7309d701d41d02"
"sha256": "93ecedebddb075cbd51f77edbe16c8457910d19320101f2a90165b15993637fe"
},
"pipfile-spec": 6,
"requires": {
......@@ -217,6 +217,20 @@
],
"version": "==4.1.0"
},
"langdetect": {
"hashes": [
"sha256:91a170d5f0ade380db809b3ba67f08e95fe6c6c8641f96d67a51ff7e98a9bf30"
],
"index": "pypi",
"version": "==1.0.7"
},
"langid": {
"hashes": [
"sha256:044bcae1912dab85c33d8e98f2811b8f4ff1213e5e9a9e9510137b84da2cb293"
],
"index": "pypi",
"version": "==1.1.6"
},
"lxml": {
"hashes": [
"sha256:01c45df6d90497c20aa2a07789a41941f9a1029faa30bf725fc7f6d515b1afe9",
......@@ -262,6 +276,34 @@
],
"version": "==1.0"
},
"numpy": {
"hashes": [
"sha256:0739146eaf4985962f07c62f7133aca89f3a600faac891ce6c7f3a1e2afe5272",
"sha256:07e21f14490324cc1160db101e9b6c1233c33985af4cb1d301dd02650fea1d7f",
"sha256:0f6a5ed0cd7ab1da11f5c07a8ecada73fc55a70ef7bb6311a4109891341d7277",
"sha256:0fd65cbbfdbf76bbf80c445d923b3accefea0fe2c2082049e0ce947c81fe1d3f",
"sha256:20cac3123d791e4bf8482a580d98d6b5969ba348b9d5364df791ba3a666b660d",
"sha256:528ce59ded2008f9e8543e0146acb3a98a9890da00adf8904b1e18c82099418b",
"sha256:56e392b7c738bd70e6f46cf48c8194d3d1dd4c5a59fae4b30c58bb6ef86e5233",
"sha256:675e0f23967ce71067d12b6944add505d5f0a251f819cfb44bdf8ee7072c090d",
"sha256:6be6b0ca705321c178c9858e5ad5611af664bbdfae1df1541f938a840a103888",
"sha256:719d914f564f35cce4dc103808f8297c807c9f0297ac183ed81ae8b5650e698e",
"sha256:768e777cc1ffdbf97c507f65975c8686ebafe0f3dc8925d02ac117acc4669ce9",
"sha256:7f76d406c6b998d6410198dcb82688dcdaec7d846aa87e263ccf52efdcfeba30",
"sha256:8c18ee4dddd5c6a811930c0a7c7947bf16387da3b394725f6063f1366311187d",
"sha256:99051e03b445117b26028623f1a487112ddf61a09a27e2d25e6bc07d37d94f25",
"sha256:a1413d06abfa942ca0553bf3bccaff5fdb36d55b84f2248e36228db871147dab",
"sha256:a7157c9ac6bddd2908c35ef099e4b643bc0e0ebb4d653deb54891d29258dd329",
"sha256:a958bf9d4834c72dee4f91a0476e7837b8a2966dc6fcfc42c421405f98d0da51",
"sha256:bb370120de6d26004358611441e07acda26840e41dfedc259d7f8cc613f96495",
"sha256:d0928076d9bd8a98de44e79b1abe50c1456e7abbb40af7ef58092086f1a6c729",
"sha256:d858423f5ed444d494b15c4cc90a206e1b8c31354c781ac7584da0d21c09c1c3",
"sha256:e6120d63b50e2248219f53302af7ec6fa2a42ed1f37e9cda2c76dbaca65036a7",
"sha256:f2b1378b63bdb581d5d7af2ec0373c8d40d651941d283a2afd7fc71184b3f570",
"sha256:facc6f925c3099ac01a1f03758100772560a0b020fb9d70f210404be08006bcb"
],
"version": "==1.14.2"
},
"parsel": {
"hashes": [
"sha256:1a9ac0c1db8175547e1732be57ced2a2dc0714590f6b249d022ad25d918ef923",
......@@ -336,6 +378,13 @@
],
"version": "==0.2.1"
},
"pycountry": {
"hashes": [
"sha256:46e4b1a21516e41fe6f8c0ef7a9876da8ce9ac3f719e3fed79cf79fd9b6206ee"
],
"index": "pypi",
"version": "==18.2.23"
},
"pycparser": {
"hashes": [
"sha256:99a8ca03e29851d96616ad0404b4aad7d9ee16f25c9f9708a11faf2810f7b226"
......
import logging
from pycountry import languages
from langdetect import detect_langs
from langdetect.lang_detect_exception import LangDetectException
from langid.langid import LanguageIdentifier, model
# Names exported by a star-import of this module; the backend-specific
# detector functions are not exported directly.
__all__ = ['lang_name', 'lang_detect']
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def lang_name(name):
    """Resolve a language identifier to its pycountry ``alpha_3`` code.

    ``name`` is handed to ``pycountry.languages.lookup``; the ``alpha_3``
    attribute of the matching record is returned.

    Returns ``None`` when ``name`` is ``None`` or when the lookup raises
    ``LookupError`` (a warning with traceback is logged in the latter case).
    """
    code = None
    if name is not None:
        try:
            code = languages.lookup(name).alpha_3
        except LookupError:
            logger.warning("Language %r not found", name, exc_info=True)
    return code
def langdetect_detect(text):
    """Detect the language of *text* with the ``langdetect`` package.

    Returns a ``(code, probability)`` tuple built from the first candidate
    reported by ``detect_langs`` (langdetect lists candidates most probable
    first). ``code`` is the result of ``lang_name`` and may be ``None`` when
    the detected language cannot be resolved.

    Returns ``None`` (after logging a warning) when langdetect raises
    ``LangDetectException`` or reports no candidates at all.
    """
    # Keep the ``try`` body down to the single call that can raise.
    try:
        candidates = detect_langs(text)
    except LangDetectException as e:
        logger.warning("Couldn't detect language: %s", e)
        return None

    # langdetect filters out low-probability candidates, so the list can be
    # empty; indexing it blindly would raise IndexError.
    if not candidates:
        logger.warning("Couldn't detect language: %s", "no candidates returned")
        return None

    best = candidates[0]
    code = best.lang
    if code:
        # langdetect reports Chinese as "zh-cn" / "zh-tw", which pycountry
        # cannot look up; keep only the primary language subtag ("zh").
        code = code.partition('-')[0]
    return lang_name(code), best.prob
# Shared langid classifier, built once at import time from langid's bundled
# model string. norm_probs=True asks langid to normalise the scores that
# classify() reports (per langid's documentation, into the 0-1 range).
_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
def langid_detect(text):
    """Classify *text* with the module-level langid identifier.

    Returns a ``(code, probability)`` tuple: the label produced by
    ``_identifier.classify`` passed through ``lang_name``, together with the
    score langid returned alongside that label.
    """
    prediction = _identifier.classify(text)
    label, score = prediction
    return lang_name(label), score
# Public detection entry point (exported via __all__). It is currently bound
# to the langdetect-based implementation; langid_detect is the alternative.
lang_detect = langdetect_detect
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment