Commit 2440034b authored by Loïc Chapron's avatar Loïc Chapron

Merge branch 'dev-loic' into 'dev'

Dev loic

See merge request !5
parents 182ba320 c10c6adc
......@@ -66,7 +66,7 @@ with open(path, 'r') as corpus :
day = tmp[2]
else:
if doc.__contains__('year'):
year = doc['year']
year = doc['year'].replace('/','').replace('.','')
else:
year = str(date.today().year)
......
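A minimal, self-contained sketch of the year clean-up introduced in the hunk above (`clean_year` is a hypothetical name; the real code runs inline on `doc`):

```python
from datetime import date

def clean_year(doc: dict) -> str:
    """Return a digits-only publication year, falling back to the current year."""
    if 'year' in doc:
        # drop separators so values like '2021.' or '2021/05' become plain digits
        return doc['year'].replace('/', '').replace('.', '')
    return str(date.today().year)
```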
[server]
maxUploadSize = 200
[browser]
gatherUsageStats = false
[[pages]]
path = "Welcome.py"
name = "Home"
icon = ":house:"
[[pages]]
name = "Convert Tools"
icon = ":twisted_rightwards_arrows:"
is_section = true
[[pages]]
path = "pages/Clean_CSV_to_TSV.py"
name = "Clean CSV To TSV"
[[pages]]
path = "pages/CSV_Harzing_to_TSV.py"
name = "CSV Harzing To TSV"
[[pages]]
path = "pages/Istex_To_GarganText.py"
name = "Istex To GarganText"
[[pages]]
path = "pages/Pubmed_To_GarganText.py"
name = "Pubmed To GarganText"
[[pages]]
path = "pages/Ris_To_GarganText.py"
name = "RIS To GarganText"
[[pages]]
path = "pages/GarganText_Json_To_TSV.py"
name = "GarganText Json To TSV"
[[pages]]
name = "PDF Convert"
icon = ":twisted_rightwards_arrows:"
is_section = true
[[pages]]
path = "pages/PDF_to_TSV.py"
name = "PDF To TSV"
[[pages]]
path = "pages/PDF_to_TXT.py"
name = "PDF To TXT"
[[pages]]
path = "pages/TXT_to_TSV.py"
name = "TXT To TSV"
[[pages]]
name = "API Tools"
icon = ":globe_with_meridians:"
is_section = true
[[pages]]
path = "pages/HAL_To_GarganText.py"
name = "HAL To GarganText"
[[pages]]
path = "pages/Isidore_To_GarganText.py"
name = "Isidore To GarganText"
[[pages]]
path = "pages/Zotero_To_GarganText.py"
name = "Zotero To GarganText"
[[pages]]
path = "pages/TSV_Translator.py"
name = "TSV Translator"
[[pages]]
name = "Other Tools"
icon = ":wrench:"
is_section = true
[[pages]]
path = "pages/Merge_Term_GarganText.py"
name = "Merge GarganText Terms"
......@@ -4,10 +4,27 @@
```shell
pip install streamlit
pip install st-pages
```
```shell
pip install httpx[http2]
pip install youtube-transcript-api
```
## Start Project
```shell
streamlit run welcome.py
```
\ No newline at end of file
```
## About YTB to TSV tool
After encountering many problems with YouTube's spam detection and blacklisting, this tool has been suspended until a solution is found.
When the requests to obtain the generated subtitles are made, YouTube starts blocking the program after an undetermined number of requests in order to prevent spam.
Even after implementing a random delay between requests, the blocking still occurs.
Another way to get the subtitles would be to use an API, but most of them require a personal key or ID, and the one that doesn't, Google Translate, is far too slow to handle all the translations.
Finally, searching only for videos with subtitles in the requested languages is not efficient enough, so the number of videos obtained is lower than the number the user asked for.
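A rough sketch of the suspended approach (illustrative only; `video_ids`, the language list and the delay bounds are placeholders), using youtube-transcript-api with a random pause between requests:

```python
import random
import time

from youtube_transcript_api import YouTubeTranscriptApi

def fetch_transcripts(video_ids, languages=("en",), min_delay=2.0, max_delay=8.0):
    """Fetch one transcript per video, sleeping a random delay between requests."""
    transcripts = {}
    for video_id in video_ids:
        try:
            transcripts[video_id] = YouTubeTranscriptApi.get_transcript(
                video_id, languages=list(languages))
        except Exception:
            # videos without subtitles (or blocked requests) are simply skipped
            continue
        time.sleep(random.uniform(min_delay, max_delay))  # spread requests out
    return transcripts
```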
"""
Streamlit Application
Loïc Chapron
"""
import streamlit as st
import src.basic as tmp
# Shared page setup (src.basic presumably loads the localized strings used below into st.session_state)
tmp.base("Welcome")
st.set_page_config(
    page_title="Hello"
)
# Display the localized welcome texts
st.write(st.session_state.general_text_dict['welcome'])
st.write(st.session_state.general_text_dict['tools'])
st.write(st.session_state.general_text_dict['code'])
st.write(st.session_state.general_text_dict['help'])
st.write("# Welcome to ")
locale,key,value
fr,title,"# CSV Harzing To TSV"
en,title,"# CSV Harzing To TSV"
fr,text,"Convertit un fichier CSV Harzing en un fichier TSV compatible avec Gargantext"
en,text,"Convert a CSV Harzing file into a TSV file compatible with GarganText"
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file :"
fr,submit," Soumettre "
en,submit," Submit "
\ No newline at end of file
locale,key,value
fr,title,"# Clean CSV To TSV"
en,title,"# Clean CSV To TSV"
fr,text,"Inspecte un fichier CSV pour vérifier s'il est compatible avec Gargantext."
en,text,"Inspect a CSV file to check if it is compatible with GarganText."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file : "
fr,error,"Erreur : le fichier n'est pas compatible avec GarganText"
en,error,"Error : the file isn't valid"
fr,errorMessage,"Télécharge le message d'erreur : "
en,errorMessage,"Download the error message : "
fr,fill,"Remplir les espaces vides automatiquement"
en,fill,"Fill blank spaces automatically"
fr,encoding,"Erreur : le fichier n'est pas encodé en UTF-8"
en,encoding,"Error : the file is not encoded in UTF-8"
\ No newline at end of file
locale,key,value
fr,title,"# Json Vers TSV"
en,title,"# Json To TSV"
fr,text,"Transforme un corpus Json venant de Gargantext en TSV pour GarganText"
en,text,"Transform a Json corpus from GarganText to a TSV file for GarganText"
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
locale,key,value
fr,title,"# HAL vers GarganText"
en,title,"# HAL To GarganText"
fr,text,"HAL est une base de document scientifique en ligne et libre d'accès contenant plus d'un million de document."
en,text,"HAL is an online and free access scientific document database containing more than a million documents"
fr,keyword,"Mots clés"
en,keyword,"Key word"
fr,lang,"Langue des textes (si possible)"
en,lang,"Text languages (if possible)"
fr,submit,"Soumettre"
en,submit,"Submit"
fr,load_api,"Chargement de l'api..."
en,load_api,"Loading API..."
fr,overload_api,"L'API est surchargée, relancer la requête dans quelques secondes"
en,overload_api,"The API is overloaded, please retry the request in a few seconds"
fr,nb_doc,"Nombre de documents : "
en,nb_doc,"Number of documents: "
fr,perform1,"Pour des raisons de performance, on limite à "
fr,perform2," le nombre de documents maximum"
en,perform1,"For performance reasons, we limit to "
en,perform2," the maximum number of documents"
fr,nb_taken,"Nombre de documents à prendre"
en,nb_taken,"Number of documents to take"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelques minutes)"
en,createTSV,"Creation of the TSV file (It may take a while)"
locale,key,value
fr,title,"# Isidore vers GarganText"
en,title,"# Isidore To GarganText"
fr,text,"Effectue une recherche Isidore de documents scientifiques et les convertir en un fichier TSV."
en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
fr,keyword,"Mots clés"
en,keyword,"Key word"
fr,lang,"Langue des textes (si possible)"
en,lang,"Text languages (if possible)"
fr,submit,"Soumettre"
en,submit,"Submit"
fr,load_api,"Chargement de l'api..."
en,load_api,"Loading API..."
fr,overload_api,"L'API est surchargée, relancer la requête dans quelques secondes"
en,overload_api,"The API is overloaded, please retry the request in a few seconds"
fr,nb_doc,"Nombre de documents : "
en,nb_doc,"Number of documents: "
fr,perform1,"Pour des raisons de performance, on limite à "
fr,perform2," le nombre de documents maximum"
en,perform1,"For performance reasons, we limit to "
en,perform2," the maximum number of documents"
fr,nb_taken,"Nombre de documents à prendre"
en,nb_taken,"Number of documents to take"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelques minutes)"
en,createTSV,"Creation of the TSV file (It may take a while)"
fr,doc_abstract1,"Il y a "
fr,doc_abstract2," documents qui peuvent ne pas avoir de description."
en,doc_abstract1,"There are "
en,doc_abstract2," documents that may not have an abstract."
\ No newline at end of file
locale,key,value
fr,title,"# Istex Vers GarganText"
en,title,"# Istex To GarganText"
fr,text,"Importe un ZIP de documents provenant d'Istex et le transforme en fichier TSV."
en,text,"Import a ZIP file coming from Istex and convert it into a TSV file."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,dup1,"Certains fichiers ("
fr,dup2,") ont été retirés pour divers raisons (fichier au mauvais format, fichier identique...)"
en,dup1,"Some file ("
en,dup2,") have been removed for various reasons (file with wrong format, file already present...)"
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
locale,key,value
fr,title,"# Merge GarganText Terms"
en,title,"# Merge GarganText Terms"
fr,text,"Fusionne 2 fichiers de termes de GarganText."
en,text,"Input 2 term files from GarganText."
fr,file," Choisir un fichier "
en,file," Choose a file "
fr,new_file," Télécharge ton fichier fusionné "
en,new_file," Download your merge file "
\ No newline at end of file
locale,key,value
fr,title,"# PDF To TSV"
en,title,"# PDF To TSV"
fr,text,"Convertit un ZIP de fichiers PDF en fichiers TSV compatibles avec Gargantext."
en,text,"Convert a ZIP of PDF files into TSV files compatible with GarganText."
fr,text2,"Cet outil détecte automatiquement les langues présentes au sein des PDF à l'aide de l'API Google Translate."
en,text2,"This tool detect automatically the languages of the PDF with the Google Translate API."
fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s) et indiquer, s'il existe, le filigrane à supprimer pour ce PDF."
en,text3,"You can choose the title and the author(s) and specify, if it does exist, the watermark to erase for this PDF."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge le ZIP de tes fichiers TSV :"
en,new_file,"Download the ZIP of your TSV files : "
fr,author,"Auteur(s) : "
en,author,"Author(s) : "
fr,titlePDF,"Titre : "
en,titlePDF,"Title : "
fr,watermark,"Filigrane : "
en,watermark,"Watermark : "
fr,submit," Soumettre "
en,submit,"Submit "
fr,loading," Conversion du pdf en cours "
en,loading," Processing pdf conversion "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : "
en,warning,"Warning ! Multiple languages have been detected at the source : "
en,warning2,"The following languages have been detected : "
fr,globalWarning, "Attention ! Plusieurs langues ont été détectées entre vos pdf !\nLes langues suivantes ont été détectées : "
en,globalWarning,"Warning ! Multiple languages have been detected for your pdfs file !\nThe following languages have been detected : "
fr,advice,"Cela pourrait affecter massivement l'analyse de GarganText.Vous pouvez régler ça en traduisant avec l'outil TsvTranslator."
en,advice,"This could massively affect the analysis of Gargantext.You can correct this by translation with the TsvTranslator tool."
locale,key,value
fr,title,"# PDF To TXT"
en,title,"# PDF To TXT"
fr,text,"Convertit un fichier PDF en fichier TXT"
en,text,"Convert a PDF file into a TXT file"
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TXT :"
en,new_file,"Download your TXT file: "
fr,watermark,"Filigrane : "
en,watermark,"Watermark : "
fr,submit," Soumettre "
en,submit,"Submit "
locale,key,value
fr,title,"# Pubmed Vers GarganText"
en,title,"# Pubmed To GarganText"
fr,text,"Transforme un corpus pubmed en TSV pour GarganText"
en,text,"Transform a pubmed corpus to a TSV file for GarganText"
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
locale,key,value
fr,title,"# RIS Vers GarganText"
en,title,"# RIS To GarganText"
fr,text,"Transforme un corpus RIS en fichier TSV pour GarganText"
en,text,"Transform a RIS corpus to a TSV file for GarganText"
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
locale,key,value
fr,title,"# TXT To TSV"
en,title,"# TXT To TSV"
fr,text,"Convertit un fichier TXT en un fichier TSV compatible avec Gargantext"
en,text,"Convert a TXT file into a TSV file compatible with GarganText"
fr,text2,"Cet outil détecte automatiquement les langues présentes au sein des PDF à l'aide de l'API Google Translate."
en,text2,"This tool detect automatically the languages of the PDF with the Google Translate API."
fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s) et indiquer, s'il existe, le filigrane de ce PDF."
en,text3,"You can choose the title and the author(s) and specify, if it does exist, the watermark for this PDF."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file : "
fr,author,"Auteur(s) : "
en,author,"Author(s) : "
fr,titlePDF,"Titre : "
en,titlePDF,"Title : "
fr,submit," Soumettre "
en,submit,"Submit "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : "
en,warning,"Warning ! Multiple languages have been detected at the source : "
en,warning2,"The following languages have been detected : "
fr,advice,"Cela pourrait affecter massivement l'analyse de GarganText. Vous pouvez régler ça en traduisant avec l'outil TsvTranslator."
en,advice,"This could massively affect the analysis of Gargantext.\nYou can correct this by translation with the TsvTranslator tool."
locale,key,value
fr,title,"# Traducteur de TSV"
en,title,"# TSV Translator"
fr,text,"Traduit un fichier TSV dans la langue de votre choix"
en,text,"Translate a TSV file in the chosen language"
fr,text2,"Cet outil détecte automatiquement les langues présentes au sein du PDF et le traduit à l'aide de l'API Google Translate. Il est donc soumit aux limitations de cet API."
en,text2,"This tool detect automatically the languages of the PDF and translate it with the Google Translate API. He is then under the limits of this API."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV traduit:"
en,new_file,"Download your translated TSV file : "
fr,submit," Soumettre "
en,submit,"Submit "
fr,detect," Détecter les langues "
en,detect," Detect languages"
fr,translate1,"Traduire de "
en,translate1,"Translate from "
fr,translate2," Vers "
en,translate2," To "
fr,detected,"Langues détectées : "
en,detected,"Detected languages : "
fr,loading,"Progression de la traduction : "
en,loading,"Translation progress : "
fr,loadingLanguages," Analyse des langues du fichier "
en,loadingLanguages," File languages analysis "
fr,sameLanguages,"Une seule langue a été détectée au sein du fichier : "
en,sameLanguages,"Only one language has been detected inside this file : "
fr,anotherFile," Traduire un autre fichier "
en,anotherFile," Translate another file "
locale,key,value
fr,title,"# Bienvenue sur GarganTools"
en,title,"# Welcome to GarganTools"
fr,welcome,"Bienvenue sur ces pages rassemblant des outils développés par des utilisateurs de GarganText pour des utilisateurs de GarganText."
en,welcome,"Welcome to these pages featuring tools developed by GarganText’ users for GarganText’ users."
fr,tools,"Les outils proposés ici ne demandent pas de ressources de calcul mais permettent de transformer des données pour faciliter une création de corpus dans un format adapté à GarganText ou au contraire d’exploiter des fichiers traités préalablement dans GarganText."
en,tools,"The tools offered here do not require computational resources, but can be used to transform data to facilitate the creation of a corpus in a format suitable for GarganText, or to exploit files previously processed in GarganText."
fr,code,"Le code de ces pages, ainsi que des outils utilisés ici sont disponibles en « open source » dans un [Gitlab dédié hébergé par l’ISC-PIF](https://gitlab.iscpif.fr/athomas/gargantexternal-tools)."
en,code,"The code for these pages, as well as the tools used here, are available as open source in a dedicated [Gitlab hosted by ISC-PIF](https://gitlab.iscpif.fr/athomas/gargantexternal-tools)."
fr,help,"N’hésitez pas à contribuer, à permettre à ces outils de s’enrichir et d’offrir de nouvelles possibilités."
en,help,"Don't hesitate to contribute, to help these tools grow and offer new possibilities."
\ No newline at end of file
locale,key,value
fr,title,"# Youtube To TSV"
en,title,"# Youtube To TSV"
fr,text,"Effectue une recherche Youtube à l'aide de mots clés (thème, titre de vidéo, lien de vidéo,...) pour créer un fichier TSV à partir des sous-titres de vidéos."
en,text,"Do a Youtube research with keywords (topic, video title, video link,...) to create a TSV file based on the subtitles of the videos."
fr,text2,"Afin d'éviter un blocage pour spam par Youtube lors des requêtes, un certain délai est établi entre chacunes d'elles. Merci de bien vouloir patienter."
en,text2,"To avoid a blacklist from Youtube when doing requests, a delay is etablished between each of it. Please be patient during the process."
fr,videoLang,"Langues des vidéos Youtube : "
en,videoLang,"Youtube videos languages : "
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,keywords,"Mots-clés : "
en,keywords,"Keywords : "
fr,number,"Nombre de vidéos recherchés : "
en,number,"Number of searched videos : "
fr,fill,"Sous-titres manuels uniquement (temps d'attente plus long)"
en,fill,"Only manual subtitles (longer waiting time)"
fr,submit," Soumettre "
en,submit," Submit "
fr,loadingID," Recherche de vidéos "
en,loadingID," Searching videos "
fr,loading,"Traitement des vidéos : "
en,loading,"Videos processing : "
fr,quantity," sur "
en,quantity," out of "
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file :"
locale,key,value
fr,title,"# Zotero vers GarganText"
en,title,"# Zotero vers GarganText"
fr,data,"Type de donnée"
en,data,"Type of data"
fr,help,"Trouvé votre ID d'utilisateur ici: https://www.zotero.org/settings/keys"
en,help,"Find your user ID here: https://www.zotero.org/settings/keys"
fr,submit,"Suivant"
en,submit,"Submit"
fr,denied,"L'acèss au compte n'est pas publique, pour la mettre publique: https://www.zotero.org/settings/privacy"
en,denied,"Account access is not public, to make it public: https://www.zotero.org/settings/privacy"
fr,add_doc,"*Ajouter les documents que vous voulez mettre dans le TSV*"
en,add_doc,"*Add the document that tou want in the TSV*"
fr,select_all,"Select All"
en,select_all,"Select All"
fr,search,"Recherche"
en,search,"Search"
fr,p_page,"Page Précédente"
en,p_page,"Previous Page"
fr,n_page,"Page Suivante"
en,n_page,"Next Page"
fr,add_collect,"**Sélectionner une collection** vous pouvez en choisir plusieurs"
en,add_collect,"**Choose a collection** you can choose more than one"
fr,chose_collect,"Choisir une collection"
en,chose_collect,"Choose a collection"
fr,fileTSV1,"Le TSV contient "
fr,fileTSV2," documents"
en,fileTSV1,"The TSV file got "
en,fileTSV2," documents"
fr,back,"Retour"
en,back,"Back"
Googletrans
===========
|GitHub license| |travis status| |Documentation Status| |PyPI version|
|Coverage Status| |Code Climate|
Googletrans is a **free** and **unlimited** python library that
implements the Google Translate API. This uses the `Google Translate Ajax
API <https://translate.google.com>`__ to make calls to such methods as
detect and translate.
Compatible with Python 3.6+.
For details refer to the `API
Documentation <https://py-googletrans.readthedocs.io/en/latest>`__.
Features
--------
- Fast and reliable - it uses the same servers that
translate.google.com uses
- Auto language detection
- Bulk translations
- Customizable service URL
- HTTP/2 support
TODO
~~~~
more features are coming soon.
- Proxy support
- Internal session management (for better bulk translations)
HTTP/2 support
~~~~~~~~~~~~~~
This library uses httpx for HTTP requests, so HTTP/2 is supported by default.
You can check whether HTTP/2 is enabled and working via the `._response.http_version` of a `Translated` or `Detected` object:
.. code:: python
>>> translator.translate('테스트')._response.http_version
# 'HTTP/2'
How does this library work
~~~~~~~~~~~~~~~~~~~~~~~~~~
You may wonder why this library works properly, whereas other
approaches such as goslate do not, since Google has updated its
translation service with a ticket mechanism to prevent a lot of
crawler programs.
I eventually figured out a way to generate a ticket by reverse
engineering the `obfuscated and minified code used by Google to
generate such
tokens <https://translate.google.com/translate/releases/twsfe_w_20170306_RC00/r/js/desktop_module_main.js>`__,
and implemented it on top of Python. However, this could be blocked at
any time.
--------------
Installation
------------
To install, either use things like pip with the package "googletrans"
or download the package and put the "googletrans" directory into your
python path.
.. code:: bash
$ pip install googletrans
Basic Usage
-----------
If source language is not given, google translate attempts to detect the
source language.
.. code:: python
>>> from googletrans import Translator
>>> translator = Translator()
>>> translator.translate('안녕하세요.')
# <Translated src=ko dest=en text=Good evening. pronunciation=Good evening.>
>>> translator.translate('안녕하세요.', dest='ja')
# <Translated src=ko dest=ja text=こんにちは。 pronunciation=Kon'nichiwa.>
>>> translator.translate('veritas lux mea', src='la')
# <Translated src=la dest=en text=The truth is my light pronunciation=The truth is my light>
Customize service URL
~~~~~~~~~~~~~~~~~~~~~
You can use another google translate domain for translation. If multiple
URLs are provided, it then randomly chooses a domain.
.. code:: python
>>> from googletrans import Translator
>>> translator = Translator(service_urls=[
'translate.google.com',
'translate.google.co.kr',
])
Customize service URL to point to standard api
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Since the translate.google.<domain> URL services use the web app, which requires a token,
you may prefer to use the direct API, which does not need any token.
This can avoid problems caused by the unstable token-generation process (refer to issue #234).
.. code:: python
>>> from googletrans import Translator
>>> translator = Translator(service_urls=[
'translate.googleapis.com'
])
Advanced Usage (Bulk)
~~~~~~~~~~~~~~~~~~~~~
An array can be used to translate a batch of strings in a single method
call and a single HTTP session. The exact same method shown above works
for arrays as well.
.. code:: python
>>> translations = translator.translate(['The quick brown fox', 'jumps over', 'the lazy dog'], dest='ko')
>>> for translation in translations:
... print(translation.origin, ' -> ', translation.text)
# The quick brown fox -> 빠른 갈색 여우
# jumps over -> 이상 점프
# the lazy dog -> 게으른 개
Language detection
~~~~~~~~~~~~~~~~~~
The detect method, as its name implies, identifies the language used in
a given sentence.
.. code:: python
>>> from googletrans import Translator
>>> translator = Translator()
>>> translator.detect('이 문장은 한글로 쓰여졌습니다.')
# <Detected lang=ko confidence=0.27041003>
>>> translator.detect('この文章は日本語で書かれました。')
# <Detected lang=ja confidence=0.64889508>
>>> translator.detect('This sentence is written in English.')
# <Detected lang=en confidence=0.22348526>
>>> translator.detect('Tiu frazo estas skribita en Esperanto.')
# <Detected lang=eo confidence=0.10538048>
GoogleTrans as a command line application
-----------------------------------------
.. code:: bash
$ translate -h
usage: translate [-h] [-d DEST] [-s SRC] [-c] text
Python Google Translator as a command-line tool
positional arguments:
text The text you want to translate.
optional arguments:
-h, --help show this help message and exit
-d DEST, --dest DEST The destination language you want to translate.
(Default: en)
-s SRC, --src SRC The source language you want to translate. (Default:
auto)
-c, --detect
$ translate "veritas lux mea" -s la -d en
[la] veritas lux mea
->
[en] The truth is my light
[pron.] The truth is my light
$ translate -c "안녕하세요."
[ko, 1] 안녕하세요.
--------------
Note on library usage
---------------------
DISCLAIMER: this is an unofficial library using the web API of translate.google.com
and also is not associated with Google.
- **The maximum character limit on a single text is 15k** (see the chunking sketch after this list).
- Due to limitations of the web version of google translate, this API
does not guarantee that the library would work properly at all times
(so please use this library if you don't care about stability).
- **Important:** If you want to use a stable API, I highly recommend you to use
`Google's official translate
API <https://cloud.google.com/translate/docs>`__.
- If you get HTTP 5xx error or errors like #6, it's probably because
Google has banned your client IP address.
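A small sketch (not part of the upstream README) of one way to stay under the 15k-character limit mentioned above, by translating a long text in chunks:

.. code:: python

    from googletrans import Translator

    MAX_CHARS = 15000  # per-request limit mentioned above

    def translate_long_text(text, dest='en', chunk_size=MAX_CHARS):
        """Translate text in pieces that each stay under the per-request limit."""
        translator = Translator()
        # naive split; a real implementation would cut on sentence boundaries
        chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        return ''.join(translator.translate(chunk, dest=dest).text for chunk in chunks)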
--------------
Versioning
----------
This library follows `Semantic Versioning <http://semver.org/>`__ from
v2.0.0. Any release versioned 0.x.y is subject to backwards incompatible
changes at any time.
Contributing
-------------------------
Contributions are more than welcomed. See
`CONTRIBUTING.md <CONTRIBUTING.md>`__
-----------------------------------------
License
-------
Googletrans is licensed under the MIT License. The terms are as
follows:
::
The MIT License (MIT)
Copyright (c) 2015 SuHun Han
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
.. |GitHub license| image:: https://img.shields.io/github/license/mashape/apistatus.svg
:target: http://opensource.org/licenses/MIT
.. |travis status| image:: https://travis-ci.org/ssut/py-googletrans.svg?branch=master
:target: https://travis-ci.org/ssut/py-googletrans
.. |Documentation Status| image:: https://readthedocs.org/projects/py-googletrans/badge/?version=latest
:target: https://readthedocs.org/projects/py-googletrans/?badge=latest
.. |PyPI version| image:: https://badge.fury.io/py/googletrans.svg
:target: http://badge.fury.io/py/googletrans
.. |Coverage Status| image:: https://coveralls.io/repos/github/ssut/py-googletrans/badge.svg
:target: https://coveralls.io/github/ssut/py-googletrans
.. |Code Climate| image:: https://codeclimate.com/github/ssut/py-googletrans/badges/gpa.svg
:target: https://codeclimate.com/github/ssut/py-googletrans
MANIFEST.in
README.rst
setup.cfg
setup.py
translate
googletrans/__init__.py
googletrans/client.py
googletrans/constants.py
googletrans/gtoken.py
googletrans/models.py
googletrans/urls.py
googletrans/utils.py
googletrans.egg-info/PKG-INFO
googletrans.egg-info/SOURCES.txt
googletrans.egg-info/dependency_links.txt
googletrans.egg-info/requires.txt
googletrans.egg-info/top_level.txt
\ No newline at end of file
"""Free Google Translate API for Python. Translates totally free of charge."""
__all__ = 'Translator',
__version__ = '4.0.0-rc.1'
from .client import Translator
from .constants import LANGCODES, LANGUAGES # noqa
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
DEFAULT_CLIENT_SERVICE_URLS = (
'translate.google.com',
)
DEFAULT_FALLBACK_SERVICE_URLS = (
'translate.googleapis.com',
)
DEFAULT_SERVICE_URLS = ('translate.google.ac', 'translate.google.ad', 'translate.google.ae',
'translate.google.al', 'translate.google.am', 'translate.google.as',
'translate.google.at', 'translate.google.az', 'translate.google.ba',
'translate.google.be', 'translate.google.bf', 'translate.google.bg',
'translate.google.bi', 'translate.google.bj', 'translate.google.bs',
'translate.google.bt', 'translate.google.by', 'translate.google.ca',
'translate.google.cat', 'translate.google.cc', 'translate.google.cd',
'translate.google.cf', 'translate.google.cg', 'translate.google.ch',
'translate.google.ci', 'translate.google.cl', 'translate.google.cm',
'translate.google.cn', 'translate.google.co.ao', 'translate.google.co.bw',
'translate.google.co.ck', 'translate.google.co.cr', 'translate.google.co.id',
'translate.google.co.il', 'translate.google.co.in', 'translate.google.co.jp',
'translate.google.co.ke', 'translate.google.co.kr', 'translate.google.co.ls',
'translate.google.co.ma', 'translate.google.co.mz', 'translate.google.co.nz',
'translate.google.co.th', 'translate.google.co.tz', 'translate.google.co.ug',
'translate.google.co.uk', 'translate.google.co.uz', 'translate.google.co.ve',
'translate.google.co.vi', 'translate.google.co.za', 'translate.google.co.zm',
'translate.google.co.zw', 'translate.google.com.af', 'translate.google.com.ag',
'translate.google.com.ai', 'translate.google.com.ar', 'translate.google.com.au',
'translate.google.com.bd', 'translate.google.com.bh', 'translate.google.com.bn',
'translate.google.com.bo', 'translate.google.com.br', 'translate.google.com.bz',
'translate.google.com.co', 'translate.google.com.cu', 'translate.google.com.cy',
'translate.google.com.do', 'translate.google.com.ec', 'translate.google.com.eg',
'translate.google.com.et', 'translate.google.com.fj', 'translate.google.com.gh',
'translate.google.com.gi', 'translate.google.com.gt', 'translate.google.com.hk',
'translate.google.com.jm', 'translate.google.com.kh', 'translate.google.com.kw',
'translate.google.com.lb', 'translate.google.com.ly', 'translate.google.com.mm',
'translate.google.com.mt', 'translate.google.com.mx', 'translate.google.com.my',
'translate.google.com.na', 'translate.google.com.ng', 'translate.google.com.ni',
'translate.google.com.np', 'translate.google.com.om', 'translate.google.com.pa',
'translate.google.com.pe', 'translate.google.com.pg', 'translate.google.com.ph',
'translate.google.com.pk', 'translate.google.com.pr', 'translate.google.com.py',
'translate.google.com.qa', 'translate.google.com.sa', 'translate.google.com.sb',
'translate.google.com.sg', 'translate.google.com.sl', 'translate.google.com.sv',
'translate.google.com.tj', 'translate.google.com.tr', 'translate.google.com.tw',
'translate.google.com.ua', 'translate.google.com.uy', 'translate.google.com.vc',
'translate.google.com.vn', 'translate.google.com', 'translate.google.cv',
'translate.google.cz', 'translate.google.de', 'translate.google.dj',
'translate.google.dk', 'translate.google.dm', 'translate.google.dz',
'translate.google.ee', 'translate.google.es', 'translate.google.eu',
'translate.google.fi', 'translate.google.fm', 'translate.google.fr',
'translate.google.ga', 'translate.google.ge', 'translate.google.gf',
'translate.google.gg', 'translate.google.gl', 'translate.google.gm',
'translate.google.gp', 'translate.google.gr', 'translate.google.gy',
'translate.google.hn', 'translate.google.hr', 'translate.google.ht',
'translate.google.hu', 'translate.google.ie', 'translate.google.im',
'translate.google.io', 'translate.google.iq', 'translate.google.is',
'translate.google.it', 'translate.google.je', 'translate.google.jo',
'translate.google.kg', 'translate.google.ki', 'translate.google.kz',
'translate.google.la', 'translate.google.li', 'translate.google.lk',
'translate.google.lt', 'translate.google.lu', 'translate.google.lv',
'translate.google.md', 'translate.google.me', 'translate.google.mg',
'translate.google.mk', 'translate.google.ml', 'translate.google.mn',
'translate.google.ms', 'translate.google.mu', 'translate.google.mv',
'translate.google.mw', 'translate.google.ne', 'translate.google.nf',
'translate.google.nl', 'translate.google.no', 'translate.google.nr',
'translate.google.nu', 'translate.google.pl', 'translate.google.pn',
'translate.google.ps', 'translate.google.pt', 'translate.google.ro',
'translate.google.rs', 'translate.google.ru', 'translate.google.rw',
'translate.google.sc', 'translate.google.se', 'translate.google.sh',
'translate.google.si', 'translate.google.sk', 'translate.google.sm',
'translate.google.sn', 'translate.google.so', 'translate.google.sr',
'translate.google.st', 'translate.google.td', 'translate.google.tg',
'translate.google.tk', 'translate.google.tl', 'translate.google.tm',
'translate.google.tn', 'translate.google.to', 'translate.google.tt',
'translate.google.us', 'translate.google.vg', 'translate.google.vu',
'translate.google.ws')
SPECIAL_CASES = {
'ee': 'et',
}
LANGUAGES = {
'af': 'afrikaans',
'sq': 'albanian',
'am': 'amharic',
'ar': 'arabic',
'hy': 'armenian',
'az': 'azerbaijani',
'eu': 'basque',
'be': 'belarusian',
'bn': 'bengali',
'bs': 'bosnian',
'bg': 'bulgarian',
'ca': 'catalan',
'ceb': 'cebuano',
'ny': 'chichewa',
'zh-cn': 'chinese (simplified)',
'zh-tw': 'chinese (traditional)',
'co': 'corsican',
'hr': 'croatian',
'cs': 'czech',
'da': 'danish',
'nl': 'dutch',
'en': 'english',
'eo': 'esperanto',
'et': 'estonian',
'tl': 'filipino',
'fi': 'finnish',
'fr': 'french',
'fy': 'frisian',
'gl': 'galician',
'ka': 'georgian',
'de': 'german',
'el': 'greek',
'gu': 'gujarati',
'ht': 'haitian creole',
'ha': 'hausa',
'haw': 'hawaiian',
'iw': 'hebrew',
'he': 'hebrew',
'hi': 'hindi',
'hmn': 'hmong',
'hu': 'hungarian',
'is': 'icelandic',
'ig': 'igbo',
'id': 'indonesian',
'ga': 'irish',
'it': 'italian',
'ja': 'japanese',
'jw': 'javanese',
'kn': 'kannada',
'kk': 'kazakh',
'km': 'khmer',
'ko': 'korean',
'ku': 'kurdish (kurmanji)',
'ky': 'kyrgyz',
'lo': 'lao',
'la': 'latin',
'lv': 'latvian',
'lt': 'lithuanian',
'lb': 'luxembourgish',
'mk': 'macedonian',
'mg': 'malagasy',
'ms': 'malay',
'ml': 'malayalam',
'mt': 'maltese',
'mi': 'maori',
'mr': 'marathi',
'mn': 'mongolian',
'my': 'myanmar (burmese)',
'ne': 'nepali',
'no': 'norwegian',
'or': 'odia',
'ps': 'pashto',
'fa': 'persian',
'pl': 'polish',
'pt': 'portuguese',
'pa': 'punjabi',
'ro': 'romanian',
'ru': 'russian',
'sm': 'samoan',
'gd': 'scots gaelic',
'sr': 'serbian',
'st': 'sesotho',
'sn': 'shona',
'sd': 'sindhi',
'si': 'sinhala',
'sk': 'slovak',
'sl': 'slovenian',
'so': 'somali',
'es': 'spanish',
'su': 'sundanese',
'sw': 'swahili',
'sv': 'swedish',
'tg': 'tajik',
'ta': 'tamil',
'te': 'telugu',
'th': 'thai',
'tr': 'turkish',
'uk': 'ukrainian',
'ur': 'urdu',
'ug': 'uyghur',
'uz': 'uzbek',
'vi': 'vietnamese',
'cy': 'welsh',
'xh': 'xhosa',
'yi': 'yiddish',
'yo': 'yoruba',
'zu': 'zulu',
}
LANGCODES = dict(map(reversed, LANGUAGES.items()))
DEFAULT_RAISE_EXCEPTION = False
DUMMY_DATA = [[["", None, None, 0]], None, "en", None,
None, None, 1, None, [["en"], None, [1], ["en"]]]
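# Quick illustration (not part of constants.py): LANGCODES is simply the reverse
# mapping of LANGUAGES, so lookups work in both directions:
#
#     >>> LANGUAGES['ko']
#     'korean'
#     >>> LANGCODES['korean']
#     'ko'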
# -*- coding: utf-8 -*-
import ast
import math
import re
import time
import httpx
from .utils import rshift
class TokenAcquirer:
"""Google Translate API token generator
translate.google.com uses a token to authorize the requests. If you are
not Google, you do not have this token and would have to pay to use the official API.
This class is the result of reverse engineering on the obfuscated and
minified code used by Google to generate such token.
The token is based on a seed which is updated once per hour and on the
text that will be translated.
Both are combined - by some strange math - in order to generate a final
token (e.g. 744915.856682) which is used by the API to validate the
request.
This operation will cause an additional request to get an initial
token from translate.google.com.
Example usage:
>>> import httpx
>>> from googletrans.gtoken import TokenAcquirer
>>> acquirer = TokenAcquirer(client=httpx.Client())
>>> text = 'test'
>>> tk = acquirer.do(text)
>>> tk
950629.577246
"""
RE_TKK = re.compile(r'tkk:\'(.+?)\'', re.DOTALL)
RE_RAWTKK = re.compile(r'tkk:\'(.+?)\'', re.DOTALL)
def __init__(self, client: httpx.Client, tkk='0', host='translate.google.com'):
self.client = client
self.tkk = tkk
self.host = host if 'http' in host else 'https://' + host
def _update(self):
"""update tkk
"""
# we don't need to update the base TKK value when it is still valid
now = math.floor(int(time.time() * 1000) / 3600000.0)
if self.tkk and int(self.tkk.split('.')[0]) == now:
return
r = self.client.get(self.host)
raw_tkk = self.RE_TKK.search(r.text)
if raw_tkk:
self.tkk = raw_tkk.group(1)
return
try:
# this will be the same as python code after stripping out a reserved word 'var'
code = self.RE_TKK.search(r.text).group(1).replace('var ', '')
# unescape special ascii characters such as \x3d (=)
code = code.encode().decode('unicode-escape')
except AttributeError:
raise Exception('Could not find TKK token for this request.\nSee https://github.com/ssut/py-googletrans/issues/234 for more details.')
except:
raise
if code:
tree = ast.parse(code)
visit_return = False
operator = '+'
n, keys = 0, dict(a=0, b=0)
for node in ast.walk(tree):
if isinstance(node, ast.Assign):
name = node.targets[0].id
if name in keys:
if isinstance(node.value, ast.Num):
keys[name] = node.value.n
# the value can sometimes be negative
elif isinstance(node.value, ast.UnaryOp) and \
isinstance(node.value.op, ast.USub): # pragma: nocover
keys[name] = -node.value.operand.n
elif isinstance(node, ast.Return):
# parameters should be set after this point
visit_return = True
elif visit_return and isinstance(node, ast.Num):
n = node.n
elif visit_return and n > 0:
# the default operator is '+' but implement some more for
# all possible scenarios
if isinstance(node, ast.Add): # pragma: nocover
pass
elif isinstance(node, ast.Sub): # pragma: nocover
operator = '-'
elif isinstance(node, ast.Mult): # pragma: nocover
operator = '*'
elif isinstance(node, ast.Pow): # pragma: nocover
operator = '**'
elif isinstance(node, ast.BitXor): # pragma: nocover
operator = '^'
# a safety way to avoid Exceptions
clause = compile('{1}{0}{2}'.format(
operator, keys['a'], keys['b']), '', 'eval')
value = eval(clause, dict(__builtin__={}))
result = '{}.{}'.format(n, value)
self.tkk = result
def _lazy(self, value):
"""like lazy evaluation, this method returns a lambda function that
returns value given.
We won't be needing this because this seems to have been built for
code obfuscation.
the original code of this method is as follows:
... code-block: javascript
var ek = function(a) {
return function() {
return a;
};
}
"""
return lambda: value
def _xr(self, a, b):
size_b = len(b)
c = 0
while c < size_b - 2:
d = b[c + 2]
d = ord(d[0]) - 87 if 'a' <= d else int(d)
d = rshift(a, d) if '+' == b[c + 1] else a << d
a = a + d & 4294967295 if '+' == b[c] else a ^ d
c += 3
return a
def acquire(self, text):
a = []
# Convert text to ints
for i in text:
val = ord(i)
if val < 0x10000:
a += [val]
else:
# Python doesn't natively use Unicode surrogates, so account for those
a += [
math.floor((val - 0x10000) / 0x400 + 0xD800),
math.floor((val - 0x10000) % 0x400 + 0xDC00)
]
b = self.tkk if self.tkk != '0' else ''
d = b.split('.')
b = int(d[0]) if len(d) > 1 else 0
# assume e means char code array
e = []
g = 0
size = len(a)
while g < size:
l = a[g]
# just append if l is less than 128(ascii: DEL)
if l < 128:
e.append(l)
# append calculated value if l is less than 2048
else:
if l < 2048:
e.append(l >> 6 | 192)
else:
# append calculated value if l matches special condition
if (l & 64512) == 55296 and g + 1 < size and \
a[g + 1] & 64512 == 56320:
g += 1
l = 65536 + ((l & 1023) << 10) + (a[g] & 1023) # This bracket is important
e.append(l >> 18 | 240)
e.append(l >> 12 & 63 | 128)
else:
e.append(l >> 12 | 224)
e.append(l >> 6 & 63 | 128)
e.append(l & 63 | 128)
g += 1
a = b
for i, value in enumerate(e):
a += value
a = self._xr(a, '+-a^+6')
a = self._xr(a, '+-3^+b+-f')
a ^= int(d[1]) if len(d) > 1 else 0
if a < 0: # pragma: nocover
a = (a & 2147483647) + 2147483648
a %= 1000000 # int(1E6)
return '{}.{}'.format(a, a ^ b)
def do(self, text):
self._update()
tk = self.acquire(text)
return tk
from httpx import Response
from typing import List
class Base:
def __init__(self, response: Response = None):
self._response = response
class TranslatedPart:
def __init__(self, text: str, candidates: List[str]):
self.text = text
self.candidates = candidates
def __str__(self):
return self.text
def __dict__(self):
return {
'text': self.text,
'candidates': self.candidates,
}
class Translated(Base):
"""Translate result object
:param src: source language (default: auto)
:param dest: destination language (default: en)
:param origin: original text
:param text: translated text
:param pronunciation: pronunciation
"""
def __init__(self, src, dest, origin, text, pronunciation, parts: List[TranslatedPart],
extra_data=None, **kwargs):
super().__init__(**kwargs)
self.src = src
self.dest = dest
self.origin = origin
self.text = text
self.pronunciation = pronunciation
self.parts = parts
self.extra_data = extra_data
def __str__(self): # pragma: nocover
return self.__unicode__()
def __unicode__(self): # pragma: nocover
return (
u'Translated(src={src}, dest={dest}, text={text}, pronunciation={pronunciation}, '
u'extra_data={extra_data})'.format(
src=self.src, dest=self.dest, text=self.text,
pronunciation=self.pronunciation,
extra_data='"' + repr(self.extra_data)[:10] + '..."'
)
)
def __dict__(self):
return {
'src': self.src,
'dest': self.dest,
'origin': self.origin,
'text': self.text,
'pronunciation': self.pronunciation,
'extra_data': self.extra_data,
'parts': list(map(lambda part: part.__dict__(), self.parts)),
}
class Detected(Base):
"""Language detection result object
:param lang: detected language
:param confidence: the confidence of detection result (0.00 to 1.00)
"""
def __init__(self, lang, confidence, **kwargs):
super().__init__(**kwargs)
self.lang = lang
self.confidence = confidence
def __str__(self): # pragma: nocover
return self.__unicode__()
def __unicode__(self): # pragma: nocover
return u'Detected(lang={lang}, confidence={confidence})'.format(
lang=self.lang, confidence=self.confidence)
# -*- coding: utf-8 -*-
"""
Predefined URLs used to make google translate requests.
"""
BASE = 'https://translate.google.com'
TRANSLATE = 'https://{host}/translate_a/single'
TRANSLATE_RPC = 'https://{host}/_/TranslateWebserverUi/data/batchexecute'
"""A conversion module for googletrans"""
import json
import re
def build_params(client, query, src, dest, token, override):
params = {
'client': client,
'sl': src,
'tl': dest,
'hl': dest,
'dt': ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'ss', 't'],
'ie': 'UTF-8',
'oe': 'UTF-8',
'otf': 1,
'ssel': 0,
'tsel': 0,
'q': query,
}
if token != '':
params['tk'] = token
if override is not None:
for key, value in get_items(override):
params[key] = value
return params
def legacy_format_json(original):
# save state
states = []
text = original
# save position for double-quoted texts
for i, pos in enumerate(re.finditer('"', text)):
# pos.start() is a double-quote
p = pos.start() + 1
if i % 2 == 0:
nxt = text.find('"', p)
states.append((p, text[p:nxt]))
# replace all weird characters in text
while text.find(',,') > -1:
text = text.replace(',,', ',null,')
while text.find('[,') > -1:
text = text.replace('[,', '[null,')
# recover state
for i, pos in enumerate(re.finditer('"', text)):
p = pos.start() + 1
if i % 2 == 0:
j = int(i / 2)
nxt = text.find('"', p)
# replacing a portion of a string
# use slicing to extract those parts of the original string to be kept
text = text[:p] + states[j][1] + text[nxt:]
converted = json.loads(text)
return converted
def get_items(dict_object):
for key in dict_object:
yield key, dict_object[key]
def format_json(original):
try:
converted = json.loads(original)
except ValueError:
converted = legacy_format_json(original)
return converted
def rshift(val, n):
"""python port for '>>>'(right shift with padding)
"""
return (val % 0x100000000) >> n
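# Short illustration (not part of utils.py): format_json falls back to
# legacy_format_json to repair Google's "[1,,2]"-style arrays, and rshift
# emulates JavaScript's unsigned right shift ('>>>').
#
#     >>> format_json('[1,,2]')
#     [1, None, 2]
#     >>> rshift(-1, 28)
#     15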
[metadata]
description-file = README.md
[egg_info]
tag_build =
tag_date = 0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os.path
import re
from setuptools import setup, find_packages
def get_file(*paths):
path = os.path.join(*paths)
try:
with open(path, 'rb') as f:
return f.read().decode('utf8')
except IOError:
pass
def get_version():
init_py = get_file(os.path.dirname(__file__), 'googletrans', '__init__.py')
pattern = r"{0}\W*=\W*'([^']+)'".format('__version__')
version, = re.findall(pattern, init_py)
return version
def get_description():
init_py = get_file(os.path.dirname(__file__), 'googletrans', '__init__.py')
pattern = r'"""(.*?)"""'
description, = re.findall(pattern, init_py, re.DOTALL)
return description
def get_readme():
return get_file(os.path.dirname(__file__), 'README.rst')
def install():
setup(
name='googletrans',
version=get_version(),
description=get_description(),
long_description=get_readme(),
license='MIT',
author='SuHun Han',
author_email='ssut' '@' 'ssut.me',
url='https://github.com/ssut/py-googletrans',
classifiers=['Development Status :: 5 - Production/Stable',
'Intended Audience :: Education',
'Intended Audience :: End Users/Desktop',
'License :: Freeware',
'Operating System :: POSIX',
'Operating System :: Microsoft :: Windows',
'Operating System :: MacOS :: MacOS X',
'Topic :: Education',
'Programming Language :: Python',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8'],
packages=find_packages(exclude=['docs', 'tests']),
keywords='google translate translator',
install_requires=[
'httpx==0.13.3',
],
python_requires= '>=3.6',
tests_require=[
'pytest',
'coveralls',
],
scripts=['translate']
)
if __name__ == "__main__":
install()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import sys
from googletrans import Translator
def main():
parser = argparse.ArgumentParser(
description='Python Google Translator as a command-line tool')
parser.add_argument('text', help='The text you want to translate.')
parser.add_argument('-d', '--dest', default='en',
help='The destination language you want to translate. (Default: en)')
parser.add_argument('-s', '--src', default='auto',
help='The source language you want to translate. (Default: auto)')
parser.add_argument('-c', '--detect', action='store_true', default=False,
help='')
args = parser.parse_args()
translator = Translator()
if args.detect:
result = translator.detect(args.text)
result = """
[{lang}, {confidence}] {text}
""".strip().format(text=args.text,
lang=result.lang, confidence=result.confidence)
print(result)
return
result = translator.translate(args.text, dest=args.dest, src=args.src)
result = u"""
[{src}] {original}
->
[{dest}] {text}
[pron.] {pronunciation}
""".strip().format(src=result.src, dest=result.dest, original=result.origin,
text=result.text, pronunciation=result.pronunciation)
print(result)
if __name__ == '__main__':
main()
Copyright 2014-2015 Michal "Mimino" Danilak
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
include README.md
include LICENSE
include NOTICE
include MANIFEST.in
include requirements.txt
include langdetect/utils/messages.properties
recursive-include langdetect/profiles *
language-detection license
==========================
Copyright (c) 2010-2014 Cybozu Labs, Inc. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
Metadata-Version: 2.1
Name: langdetect
Version: 1.0.9
Summary: Language detection library ported from Google's language-detection.
Home-page: https://github.com/Mimino666/langdetect
Author: Michal Mimino Danilak
Author-email: michal.danilak@gmail.com
License: MIT
Description: langdetect
==========
[![Build Status](https://travis-ci.org/Mimino666/langdetect.svg?branch=master)](https://travis-ci.org/Mimino666/langdetect)
Port of Nakatani Shuyo's [language-detection](https://github.com/shuyo/language-detection) library (version from 03/03/2014) to Python.
Installation
============
$ pip install langdetect
Supported Python versions 2.7, 3.4+.
Languages
=========
``langdetect`` supports 55 languages out of the box ([ISO 639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)):
af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw
Basic usage
===========
To detect the language of the text:
```python
>>> from langdetect import detect
>>> detect("War doesn't show who's right, just who's left.")
'en'
>>> detect("Ein, zwei, drei, vier")
'de'
```
To find out the probabilities for the top languages:
```python
>>> from langdetect import detect_langs
>>> detect_langs("Otec matka syn.")
[sk:0.572770823327, pl:0.292872522702, cs:0.134356653968]
```
**NOTE**
The language detection algorithm is non-deterministic, which means that if you try to run it on a text which is either too short or too ambiguous, you might get different results every time you run it.
To enforce consistent results, call the following code before the first language detection:
```python
from langdetect import DetectorFactory
DetectorFactory.seed = 0
```
How to add new language?
========================
You need to create a new language profile. The easiest way to do it is to use the [langdetect.jar](https://github.com/shuyo/language-detection/raw/master/lib/langdetect.jar) tool, which can generate language profiles from Wikipedia abstract database files or plain text.
Wikipedia abstract database files can be retrieved from "Wikipedia Downloads" ([http://download.wikimedia.org/](http://download.wikimedia.org/)). They form '(language code)wiki-(version)-abstract.xml' (e.g. 'enwiki-20101004-abstract.xml' ).
usage: ``java -jar langdetect.jar --genprofile -d [directory path] [language codes]``
- Specify the directory which has abstract databases by -d option.
- This tool can handle gzip compressed file.
Remark: The database filename in Chinese is like 'zhwiki-(version)-abstract-zh-cn.xml' or zhwiki-(version)-abstract-zh-tw.xml', so that it must be modified 'zh-cnwiki-(version)-abstract.xml' or 'zh-twwiki-(version)-abstract.xml'.
To generate language profile from a plain text, use the genprofile-text command.
usage: ``java -jar langdetect.jar --genprofile-text -l [language code] [text file path]``
For more details see [language-detection Wiki](https://code.google.com/archive/p/language-detection/wikis/Tools.wiki).
Original project
================
This library is a direct port of Google's [language-detection](https://code.google.com/p/language-detection/) library from Java to Python. All the classes and methods are unchanged, so for more information see the project's website or wiki.
Presentation of the language detection algorithm: [http://www.slideshare.net/shuyo/language-detection-library-for-java](http://www.slideshare.net/shuyo/language-detection-library-for-java).
Keywords: language detection library
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.4
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Description-Content-Type: text/markdown
langdetect
==========
[![Build Status](https://travis-ci.org/Mimino666/langdetect.svg?branch=master)](https://travis-ci.org/Mimino666/langdetect)
Port of Nakatani Shuyo's [language-detection](https://github.com/shuyo/language-detection) library (version from 03/03/2014) to Python.
Installation
============
$ pip install langdetect
Supported Python versions 2.7, 3.4+.
Languages
=========
``langdetect`` supports 55 languages out of the box ([ISO 639-1 codes](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes)):
af, ar, bg, bn, ca, cs, cy, da, de, el, en, es, et, fa, fi, fr, gu, he,
hi, hr, hu, id, it, ja, kn, ko, lt, lv, mk, ml, mr, ne, nl, no, pa, pl,
pt, ro, ru, sk, sl, so, sq, sv, sw, ta, te, th, tl, tr, uk, ur, vi, zh-cn, zh-tw
Basic usage
===========
To detect the language of the text:
```python
>>> from langdetect import detect
>>> detect("War doesn't show who's right, just who's left.")
'en'
>>> detect("Ein, zwei, drei, vier")
'de'
```
To find out the probabilities for the top languages:
```python
>>> from langdetect import detect_langs
>>> detect_langs("Otec matka syn.")
[sk:0.572770823327, pl:0.292872522702, cs:0.134356653968]
```
**NOTE**
The language detection algorithm is non-deterministic, which means that if you try to run it on a text which is either too short or too ambiguous, you might get different results every time you run it.
To enforce consistent results, call the following code before the first language detection:
```python
from langdetect import DetectorFactory
DetectorFactory.seed = 0
```
How to add new language?
========================
You need to create a new language profile. The easiest way to do it is to use the [langdetect.jar](https://github.com/shuyo/language-detection/raw/master/lib/langdetect.jar) tool, which can generate language profiles from Wikipedia abstract database files or plain text.
Wikipedia abstract database files can be retrieved from "Wikipedia Downloads" ([http://download.wikimedia.org/](http://download.wikimedia.org/)). They form '(language code)wiki-(version)-abstract.xml' (e.g. 'enwiki-20101004-abstract.xml' ).
usage: ``java -jar langdetect.jar --genprofile -d [directory path] [language codes]``
- Specify the directory which has abstract databases by -d option.
- This tool can handle gzip compressed file.
Remark: The database filename in Chinese is like 'zhwiki-(version)-abstract-zh-cn.xml' or zhwiki-(version)-abstract-zh-tw.xml', so that it must be modified 'zh-cnwiki-(version)-abstract.xml' or 'zh-twwiki-(version)-abstract.xml'.
To generate language profile from a plain text, use the genprofile-text command.
usage: ``java -jar langdetect.jar --genprofile-text -l [language code] [text file path]``
For more details see [language-detection Wiki](https://code.google.com/archive/p/language-detection/wikis/Tools.wiki).
Original project
================
This library is a direct port of Google's [language-detection](https://code.google.com/p/language-detection/) library from Java to Python. All the classes and methods are unchanged, so for more information see the project's website or wiki.
Presentation of the language detection algorithm: [http://www.slideshare.net/shuyo/language-detection-library-for-java](http://www.slideshare.net/shuyo/language-detection-library-for-java).
Metadata-Version: 2.1
Name: langdetect
Version: 1.0.9
Summary: Language detection library ported from Google's language-detection.
Home-page: https://github.com/Mimino666/langdetect
Author: Michal Mimino Danilak
Author-email: michal.danilak@gmail.com
License: MIT
Keywords: language detection library
Platform: UNKNOWN
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.4
Classifier: Programming Language :: Python :: 3.5
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Description-Content-Type: text/markdown
LICENSE
MANIFEST.in
NOTICE
README.md
requirements.txt
setup.py
langdetect/__init__.py
langdetect/detector.py
langdetect/detector_factory.py
langdetect/lang_detect_exception.py
langdetect/language.py
langdetect.egg-info/PKG-INFO
langdetect.egg-info/SOURCES.txt
langdetect.egg-info/dependency_links.txt
langdetect.egg-info/requires.txt
langdetect.egg-info/top_level.txt
langdetect/profiles/af
langdetect/profiles/ar
langdetect/profiles/bg
langdetect/profiles/bn
langdetect/profiles/ca
langdetect/profiles/cs
langdetect/profiles/cy
langdetect/profiles/da
langdetect/profiles/de
langdetect/profiles/el
langdetect/profiles/en
langdetect/profiles/es
langdetect/profiles/et
langdetect/profiles/fa
langdetect/profiles/fi
langdetect/profiles/fr
langdetect/profiles/gu
langdetect/profiles/he
langdetect/profiles/hi
langdetect/profiles/hr
langdetect/profiles/hu
langdetect/profiles/id
langdetect/profiles/it
langdetect/profiles/ja
langdetect/profiles/kn
langdetect/profiles/ko
langdetect/profiles/lt
langdetect/profiles/lv
langdetect/profiles/mk
langdetect/profiles/ml
langdetect/profiles/mr
langdetect/profiles/ne
langdetect/profiles/nl
langdetect/profiles/no
langdetect/profiles/pa
langdetect/profiles/pl
langdetect/profiles/pt
langdetect/profiles/ro
langdetect/profiles/ru
langdetect/profiles/sk
langdetect/profiles/sl
langdetect/profiles/so
langdetect/profiles/sq
langdetect/profiles/sv
langdetect/profiles/sw
langdetect/profiles/ta
langdetect/profiles/te
langdetect/profiles/th
langdetect/profiles/tl
langdetect/profiles/tr
langdetect/profiles/uk
langdetect/profiles/ur
langdetect/profiles/vi
langdetect/profiles/zh-cn
langdetect/profiles/zh-tw
langdetect/tests/__init__.py
langdetect/tests/test_detector.py
langdetect/tests/test_language.py
langdetect/utils/__init__.py
langdetect/utils/lang_profile.py
langdetect/utils/messages.properties
langdetect/utils/messages.py
langdetect/utils/ngram.py
langdetect/utils/unicode_block.py
\ No newline at end of file
from .detector_factory import DetectorFactory, PROFILES_DIRECTORY, detect, detect_langs
from .lang_detect_exception import LangDetectException
import random
import re
import six
from six.moves import zip, xrange
from .lang_detect_exception import ErrorCode, LangDetectException
from .language import Language
from .utils.ngram import NGram
from .utils.unicode_block import unicode_block
class Detector(object):
'''
Detector detects the language of a given text.
Instances are constructed via the factory class DetectorFactory.
After appending the target text to a Detector instance with .append(string),
the detector provides language detection results for that text via .detect() or .get_probabilities().
The .detect() method returns the single language name with the highest probability.
The .get_probabilities() method returns a list of languages and their probabilities.
The detector also has some parameters for language detection:
see .set_alpha(float), .set_max_text_length(int) and .set_prior_map(dict).
Example:
from langdetect.detector_factory import DetectorFactory
factory = DetectorFactory()
factory.load_profile('/path/to/profile/directory')
def detect(text):
detector = factory.create()
detector.append(text)
return detector.detect()
def detect_langs(text):
detector = factory.create()
detector.append(text)
return detector.get_probabilities()
'''
ALPHA_DEFAULT = 0.5
ALPHA_WIDTH = 0.05
ITERATION_LIMIT = 1000
PROB_THRESHOLD = 0.1
CONV_THRESHOLD = 0.99999
BASE_FREQ = 10000
UNKNOWN_LANG = 'unknown'
URL_RE = re.compile(r'https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}')
MAIL_RE = re.compile(r'[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}')
def __init__(self, factory):
self.word_lang_prob_map = factory.word_lang_prob_map
self.langlist = factory.langlist
self.seed = factory.seed
self.random = random.Random()
self.text = ''
self.langprob = None
self.alpha = self.ALPHA_DEFAULT
self.n_trial = 7
self.max_text_length = 10000
self.prior_map = None
self.verbose = False
def set_verbose(self):
self.verbose = True
def set_alpha(self, alpha):
self.alpha = alpha
def set_prior_map(self, prior_map):
'''Set prior information about language probabilities.'''
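# Usage sketch (hypothetical weights): detector.set_prior_map({'en': 0.7, 'fr': 0.3})
# Keys are language codes from the loaded profiles; weights must be non-negative
# and are normalized below so that they sum to 1.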
self.prior_map = [0.0] * len(self.langlist)
sump = 0.0
for i in xrange(len(self.prior_map)):
lang = self.langlist[i]
if lang in prior_map:
p = prior_map[lang]
if p < 0:
raise LangDetectException(ErrorCode.InitParamError, 'Prior probability must be non-negative.')
self.prior_map[i] = p
sump += p
if sump <= 0.0:
raise LangDetectException(ErrorCode.InitParamError, 'At least one prior probability must be non-zero.')
for i in xrange(len(self.prior_map)):
self.prior_map[i] /= sump
def set_max_text_length(self, max_text_length):
'''Specify the maximum size of target text used for language detection.
The default value is 10000 (10 KB).
'''
self.max_text_length = max_text_length
def append(self, text):
'''Append the target text for language detection.
If the total size of the target text exceeds the limit specified by
Detector.set_max_text_length(int), the rest is truncated.
'''
text = self.URL_RE.sub(' ', text)
text = self.MAIL_RE.sub(' ', text)
text = NGram.normalize_vi(text)
pre = 0
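# The loop below keeps at most max_text_length characters and collapses runs of
# consecutive spaces into a single space.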
for i in xrange(min(len(text), self.max_text_length)):
ch = text[i]
if ch != ' ' or pre != ' ':
self.text += ch
pre = ch
def cleaning_text(self):
'''Clean the text before detection
(remove URLs, e-mail addresses, and Latin-alphabet characters when the text is predominantly non-Latin).
'''
latin_count, non_latin_count = 0, 0
for ch in self.text:
if 'A' <= ch <= 'z':
latin_count += 1
elif ch >= six.u('\u0300') and unicode_block(ch) != 'Latin Extended Additional':
non_latin_count += 1
if latin_count * 2 < non_latin_count:
text_without_latin = ''
for ch in self.text:
if ch < 'A' or 'z' < ch:
text_without_latin += ch
self.text = text_without_latin
def detect(self):
'''Detect language of the target text and return the language name
which has the highest probability.
'''
probabilities = self.get_probabilities()
if probabilities:
return probabilities[0].lang
return self.UNKNOWN_LANG
def get_probabilities(self):
if self.langprob is None:
self._detect_block()
return self._sort_probability(self.langprob)
def _detect_block(self):
self.cleaning_text()
ngrams = self._extract_ngrams()
if not ngrams:
raise LangDetectException(ErrorCode.CantDetectError, 'No features in text.')
self.langprob = [0.0] * len(self.langlist)
self.random.seed(self.seed)
for t in xrange(self.n_trial):
prob = self._init_probability()
alpha = self.alpha + self.random.gauss(0.0, 1.0) * self.ALPHA_WIDTH
i = 0
while True:
self._update_lang_prob(prob, self.random.choice(ngrams), alpha)
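# Convergence is only checked when i % 5 == 0: iteration stops once the top
# probability exceeds CONV_THRESHOLD or i reaches ITERATION_LIMIT.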
if i % 5 == 0:
if self._normalize_prob(prob) > self.CONV_THRESHOLD or i >= self.ITERATION_LIMIT:
break
if self.verbose:
six.print_('>', self._sort_probability(prob))
i += 1
for j in xrange(len(self.langprob)):
self.langprob[j] += prob[j] / self.n_trial
if self.verbose:
six.print_('==>', self._sort_probability(prob))
def _init_probability(self):
'''Initialize the map of language probabilities.
If a prior map was specified, use it as the initial map.
'''
if self.prior_map is not None:
return list(self.prior_map)
else:
return [1.0 / len(self.langlist)] * len(self.langlist)
def _extract_ngrams(self):
'''Extract n-grams from target text.'''
RANGE = list(xrange(1, NGram.N_GRAM + 1))
result = []
ngram = NGram()
for ch in self.text:
ngram.add_char(ch)
if ngram.capitalword:
continue
for n in RANGE:
# optimized w = ngram.get(n)
if len(ngram.grams) < n:
break
w = ngram.grams[-n:]
if w and w != ' ' and w in self.word_lang_prob_map:
result.append(w)
return result
def _update_lang_prob(self, prob, word, alpha):
'''Update language probabilities with an N-gram string (N=1,2,3).'''
if word is None or word not in self.word_lang_prob_map:
return False
lang_prob_map = self.word_lang_prob_map[word]
if self.verbose:
six.print_('%s(%s): %s' % (word, self._unicode_encode(word), self._word_prob_to_string(lang_prob_map)))
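# Additive smoothing: alpha / BASE_FREQ gives every language a small non-zero
# weight for this n-gram, so unseen n-grams do not zero out a language outright.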
weight = alpha / self.BASE_FREQ
for i in xrange(len(prob)):
prob[i] *= weight + lang_prob_map[i]
return True
def _word_prob_to_string(self, prob):
result = ''
for j in xrange(len(prob)):
p = prob[j]
if p >= 0.00001:
result += ' %s:%.5f' % (self.langlist[j], p)
return result
def _normalize_prob(self, prob):
'''Normalize probabilities and check convergence by the maximum probability.
'''
maxp, sump = 0.0, sum(prob)
for i in xrange(len(prob)):
p = prob[i] / sump
if maxp < p:
maxp = p
prob[i] = p
return maxp
def _sort_probability(self, prob):
result = [Language(lang, p) for (lang, p) in zip(self.langlist, prob) if p > self.PROB_THRESHOLD]
result.sort(reverse=True)
return result
def _unicode_encode(self, word):
buf = ''
for ch in word:
if ch >= six.u('\u0080'):
st = hex(0x10000 + ord(ch))[2:]
while len(st) < 4:
st = '0' + st
buf += r'\u' + st[1:5]
else:
buf += ch
return buf
import os
from os import path
import sys
try:
import simplejson as json
except ImportError:
import json
from .detector import Detector
from .lang_detect_exception import ErrorCode, LangDetectException
from .utils.lang_profile import LangProfile
class DetectorFactory(object):
'''
Language Detector Factory Class.
This class manages the initialization and construction of Detector instances.
Before using the language detection library,
load profiles with DetectorFactory.load_profile(str)
and set the initialization parameters.
To perform language detection,
construct a Detector instance via DetectorFactory.create().
See also Detector's sample code.
'''
seed = None
def __init__(self):
self.word_lang_prob_map = {}
self.langlist = []
def load_profile(self, profile_directory):
list_files = os.listdir(profile_directory)
if not list_files:
raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Not found profile: ' + profile_directory)
langsize, index = len(list_files), 0
for filename in list_files:
if filename.startswith('.'):
continue
filename = path.join(profile_directory, filename)
if not path.isfile(filename):
continue
f = None
try:
if sys.version_info[0] < 3:
f = open(filename, 'r')
else:
f = open(filename, 'r', encoding='utf-8')
json_data = json.load(f)
profile = LangProfile(**json_data)
self.add_profile(profile, index, langsize)
index += 1
except IOError:
raise LangDetectException(ErrorCode.FileLoadError, 'Cannot open "%s"' % filename)
except:
raise LangDetectException(ErrorCode.FormatError, 'Profile format error in "%s"' % filename)
finally:
if f:
f.close()
def load_json_profile(self, json_profiles):
langsize, index = len(json_profiles), 0
if langsize < 2:
raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need at least 2 profiles.')
for json_profile in json_profiles:
try:
json_data = json.loads(json_profile)
profile = LangProfile(**json_data)
self.add_profile(profile, index, langsize)
index += 1
except:
raise LangDetectException(ErrorCode.FormatError, 'Profile format error.')
def add_profile(self, profile, index, langsize):
lang = profile.name
if lang in self.langlist:
raise LangDetectException(ErrorCode.DuplicateLangError, 'Duplicate language profile.')
self.langlist.append(lang)
for word in profile.freq:
if word not in self.word_lang_prob_map:
self.word_lang_prob_map[word] = [0.0] * langsize
length = len(word)
if 1 <= length <= 3:
prob = 1.0 * profile.freq.get(word) / profile.n_words[length - 1]
self.word_lang_prob_map[word][index] = prob
def clear(self):
self.langlist = []
self.word_lang_prob_map = {}
def create(self, alpha=None):
'''Construct a Detector instance with the given smoothing parameter alpha.'''
detector = self._create_detector()
if alpha is not None:
detector.set_alpha(alpha)
return detector
def _create_detector(self):
if not self.langlist:
raise LangDetectException(ErrorCode.NeedLoadProfileError, 'Need to load profiles.')
return Detector(self)
def set_seed(self, seed):
self.seed = seed
def get_lang_list(self):
return list(self.langlist)
PROFILES_DIRECTORY = path.join(path.dirname(__file__), 'profiles')
_factory = None
def init_factory():
global _factory
if _factory is None:
_factory = DetectorFactory()
_factory.load_profile(PROFILES_DIRECTORY)
def detect(text):
init_factory()
detector = _factory.create()
detector.append(text)
return detector.detect()
def detect_langs(text):
init_factory()
detector = _factory.create()
detector.append(text)
return detector.get_probabilities()
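# Usage sketch: the module-level detect()/detect_langs() helpers share one lazily
# created factory. Setting DetectorFactory.seed before detecting makes the
# otherwise non-deterministic results reproducible, e.g.:
#   from langdetect import DetectorFactory, detect
#   DetectorFactory.seed = 0
#   detect('Ein, zwei, drei, vier')  # expected: 'de'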
_error_codes = {
'NoTextError': 0,
'FormatError': 1,
'FileLoadError': 2,
'DuplicateLangError': 3,
'NeedLoadProfileError': 4,
'CantDetectError': 5,
'CantOpenTrainData': 6,
'TrainDataFormatError': 7,
'InitParamError': 8,
}
ErrorCode = type('ErrorCode', (), _error_codes)
class LangDetectException(Exception):
def __init__(self, code, message):
super(LangDetectException, self).__init__(message)
self.code = code
def get_code(self):
return self.code
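# Usage sketch: detection raises LangDetectException (e.g. ErrorCode.CantDetectError)
# when the text yields no usable features, so callers typically wrap detect():
#   try:
#       lang = detect(text)
#   except LangDetectException:
#       lang = 'unknown'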
class Language(object):
'''
Language stores a detected language and its probability.
Detector.get_probabilities() returns a list of Language objects.
'''
def __init__(self, lang, prob):
self.lang = lang
self.prob = prob
def __repr__(self):
if self.lang is None:
return ''
return '%s:%s' % (self.lang, self.prob)
def __lt__(self, other):
return self.prob < other.prob