Commit b6f15452 authored by Atrax Nicolas

Update pages

parent 555164b4
[[pages]]
path = "Welcome.py"
name = "Home"
path = "Homepage.py"
name = "Homepage"
icon = ":house:"
[[pages]]
......
......@@ -4,13 +4,13 @@ Loïc Chapron
"""
import streamlit as st
import src.basic as tmp
import pandas as pd
import src.basic as tmp
tmp.base("Welcome")
tmp.base("Homepage")
st.write(st.session_state.general_text_dict['welcome'])
st.write(st.session_state.general_text_dict['tools'])
st.write(st.session_state.general_text_dict['code'])
st.write(st.session_state.general_text_dict['help'])
......@@ -29,8 +29,8 @@ en,watermark,"Watermark : "
fr,submit," Soumettre "
en,submit,"Submit "
fr,loading," Conversion du pdf en cours "
en,loading," Processing pdf conversion "
fr,loading," Conversion du PDF en cours "
en,loading," Processing PDF conversion "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : "
......
......@@ -5,11 +5,11 @@ en,title,"# TXT To TSV"
fr,text,"Convertit un fichier TXT en un fichier TSV compatible avec Gargantext"
en,text,"Convert a TXT file into a TSV file compatible with GarganText"
fr,text2,"Cet outil détecte automatiquement les langues présentes au sein des PDF à l'aide de l'API Google Translate."
en,text2,"This tool detect automatically the languages of the PDF with the Google Translate API."
fr,text2,"Convertit un ZIP de fichiers TXT en fichiers TSV compatibles avec Gargantext"
en,text2,"Convert a ZIP of TXT files into TSV files compatible with GarganText"
fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s) et indiquer, s'il existe, le filigrane de ce PDF."
en,text3,"You can choose the title and the author(s) and specify, if it does exist, the watermark for this PDF."
fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s) de ce TXT."
en,text3,"You can choose the title and the author(s) for this TXT."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
......@@ -20,12 +20,15 @@ en,new_file,"Download your TSV file : "
fr,author,"Auteur(s) : "
en,author,"Author(s) : "
fr,titlePDF,"Titre : "
en,titlePDF,"Title : "
fr,titleTXT,"Titre : "
en,titleTXT,"Title : "
fr,submit," Soumettre "
en,submit,"Submit "
fr,loading," Conversion du TXT en cours "
en,loading," Processing TXT conversion "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : "
en,warning,"Warning ! Multiple languages have been detected at the source : "
......
......@@ -141,4 +141,4 @@ if st.session_state.page == 1:
tsv = HarzingToTsv(separator)
name = st.session_state.file.name.split('.')[0] + '.tsv'
st.write(st.session_state.general_text_dict['new_file'])
st.download_button(name, tsv, name, on_click=resetPage())
st.download_button(name, tsv, name, on_click=resetPage)
......@@ -151,7 +151,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
languages = detectLanguages(doc, languages)
st.session_state.pdfLanguages[fileName] = detectMultipleLanguages(
languages, fileName)
return tsv, languages
return tsv
def correctedSequence(text, last):
......@@ -173,21 +173,21 @@ def getInfo():
return st.session_state.author, title, st.session_state.watermark
def txtToTSV(fileName, fileAddress, pdfDir):
def pdfToTSV(fileName, fileAddress, pdfDir):
st.session_state.page = 1
author, title, watermark = getInfo()
tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
with st.spinner(st.session_state.general_text_dict['loading']):
tsv, languages = segmentAbstract(fileName, fileAddress, tsv, author, title,
str(date.today().year), "1", "1", watermark)
tsv = segmentAbstract(fileName, fileAddress, tsv, author, title,
str(date.today().year), "1", "1", watermark)
if '/' in fileName:
fileName = fileName.split('/')[1]
with open(pdfDir + "/" + fileName.replace(".pdf", ".tsv"), "w", encoding="utf-8-sig") as file:
with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w", encoding="utf-8-sig") as file:
file.write(tsv)
tsv = "\n".join(tsv.split("\n")[1:])
return tsv, languages
return tsv
def extractAllPDF(zipDir, zipFile):
......@@ -223,6 +223,10 @@ def setSubmit():
st.session_state.submit = True
def resetPage():
    """Callback: send the page state machine back to its first step (page 0)."""
    st.session_state["page"] = 0
def upPage():
    """Callback: move the page state machine to step 2."""
    st.session_state["page"] = 2
......@@ -234,7 +238,7 @@ def uploadZip():
st.file_uploader(
st.session_state.general_text_dict['file'], type=["zip"], key='file')
st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit())
st.session_state.general_text_dict['submit'], on_click=setSubmit)
def askPDF(fileName):
......@@ -254,7 +258,7 @@ def askPDF(fileName):
st.text_input(
st.session_state.general_text_dict['watermark'], key='watermark')
st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=upPage())
st.session_state.general_text_dict['submit'], on_click=upPage)
# Page Code End
......@@ -279,7 +283,7 @@ if st.session_state.page == 0:
if st.session_state.page == 2:
fileName = st.session_state.fileName
tmp, languages = txtToTSV(
tmp = pdfToTSV(
fileName, st.session_state.zipDir.name + '/' + fileName, st.session_state.pdfDir.name)
st.session_state.tsv += "\n" + tmp
if st.session_state.nbDoc == st.session_state.len - 1:
......@@ -306,4 +310,4 @@ if st.session_state.page == 3:
detectMultiplePdfLanguages()
st.write(st.session_state.general_text_dict['new_file'])
st.download_button("PDFCompilation.zip",
zip, "PDFCompilation.zip")
zip, "PDFCompilation.zip", on_click=resetPage)
......@@ -97,7 +97,7 @@ def askPDF():
st.file_uploader(
st.session_state.general_text_dict['file'], type=["pdf"], key='file')
st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit())
st.session_state.general_text_dict['submit'], on_click=setSubmit)
# Page Code End
......@@ -124,4 +124,4 @@ if st.session_state.page == 1:
name = st.session_state.file.name.split('.')[0] + '.txt'
st.write(st.session_state.general_text_dict['new_file'])
st.session_state.submit = False
st.download_button(name, txt, name, on_click=setPage())
st.download_button(name, txt, name, on_click=setPage)
......@@ -43,12 +43,13 @@ def estimateLanguagesPercentage(languages):
for l in languages:
total += languages[l]
for l in languages:
tmp = (languages[l] / total) * 100
tmp = round((languages[l] / total) * 100, 1)
if tmp >= 15:
res[l] = tmp
if st.session_state.detected != "":
st.session_state.detected += "| "
st.session_state.detected += l + " : " + str(tmp) + "%"
print(res)
return res
......@@ -182,10 +183,11 @@ def uploadTSV():
with st.form("Detect"):
st.write(st.session_state.general_text_dict['text'])
st.write(st.session_state.general_text_dict['text2'])
st.file_uploader(
st.session_state.general_text_dict['file'], type=["tsv", "csv"], key='file')
st.form_submit_button(
st.session_state.general_text_dict['detect'], on_click=setDetect())
st.session_state.general_text_dict['detect'], on_click=setDetect)
def askTranslateLanguages(file):
......@@ -202,7 +204,7 @@ def askTranslateLanguages(file):
st.selectbox(st.session_state.general_text_dict['translate2'], st.session_state.languages,
key='destLang')
st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit())
st.session_state.general_text_dict['submit'], on_click=setSubmit)
# Page Code End
......@@ -239,11 +241,11 @@ if st.session_state.page == 2:
st.write(st.session_state.general_text_dict['new_file'])
name = st.session_state.tmpFile.name
st.download_button(name,
tsv, name, on_click=resetPage())
tsv, name, on_click=resetPage)
if st.session_state.page == 3:
st.write(
st.session_state.general_text_dict['sameLanguages'] + list(st.session_state.languages.keys())[0])
st.session_state.languages = {}
st.button(
st.session_state.general_text_dict['anotherFile'], on_click=resetPage())
st.session_state.general_text_dict['anotherFile'], on_click=resetPage)
......@@ -4,13 +4,12 @@ Nicolas Atrax
"""
import streamlit as st
import pandas as pd
import chardet
import zipfile
import tempfile
import shutil
import os
import re
from datetime import date
import codecs
import os
import tempfile
from lib.langdetect.langdetect import detect
from lib.langdetect.langdetect.lang_detect_exception import LangDetectException
import src.basic as tmp
......@@ -70,6 +69,18 @@ def detectMultipleLanguages(languages, fileName):
return principal
def detectMultipleTxtLanguages():
    """Warn the user when the processed TXT files do not share one language.

    The first value stored in ``st.session_state.txtLanguages`` is taken as
    the reference. As soon as another value differs from it, three info
    boxes are shown (global warning, the per-file mapping, and an advice
    message). Nothing is displayed when the mapping is empty or when every
    entry carries the same value.
    """
    detected = iter(st.session_state.txtLanguages.values())
    reference = next(detected, None)
    # `not in (reference,)` keeps the identity-or-equality semantics of a
    # list membership test against the single reference value.
    if any(language not in (reference,) for language in detected):
        st.info(st.session_state.general_text_dict['globalWarning'])
        st.info(str(st.session_state.txtLanguages))
        st.info(st.session_state.general_text_dict['advice'])
def segmentAbstract(abstract, tsv):
year = str(date.today().year)
month = "1"
......@@ -101,6 +112,35 @@ def segmentAbstract(abstract, tsv):
return tsv
def segmentAbstract2(abstract, tsv, author, title):
    """Append a TXT document to ``tsv`` as a series of overlapping segments.

    The document is cut into windows of up to 9 lines. The window start
    advances by 7 lines per row, so consecutive rows overlap, and the last
    window is realigned so that it ends on the final lines of the document.
    Each row is dated January 1st of the current year and is titled
    ``"<title> : Part <k>"``.

    Args:
        abstract: list of text lines of the document.
        tsv: TSV text accumulated so far; new rows are appended to it.
        author: value written in the ``authors`` column of every row.
        title: document title, also used as the ``source`` column.

    Returns:
        The TSV text with one row appended per segment.

    Side effects:
        Stores the result of ``detectMultipleLanguages`` in
        ``st.session_state.txtLanguages`` under the name of the file being
        processed.
    """
    year = str(date.today().year)
    month = "1"
    day = "1"
    source = title
    nbLines = len(abstract)
    n = 0
    count = 1
    languages = {}
    while n < nbLines - 2:
        doc = "".join(abstract[n: n + 9]).replace("�", "")
        partTitle = source + " : Part " + str(count)
        tsv += correctedSequence(author, False) + "\t" + correctedSequence(
            source, False) + "\t" + year + "\t" + month + "\t" + day + "\t"
        tsv += correctedSequence(partTitle, False) + "\t"
        tsv += correctedSequence(doc, True)
        if tsv[-1] != "\n":
            tsv += "\n"
        n += 7
        count += 1
        # Realign the final window so it covers the last 9 lines instead of
        # producing a short trailing segment.
        if n > nbLines - 9 and n != nbLines - 2:
            n = nbLines - 9
        # NOTE(review): indentation was reconstructed; the language tally is
        # assumed to be updated once per segment - confirm against the repo.
        languages = detectLanguages(doc, languages)
    # The file name is read from the session state rather than from an
    # accidental module-level global named `fileName`; the page script
    # stores the current file there before conversion starts.
    currentFile = st.session_state.fileName
    st.session_state.txtLanguages[currentFile] = detectMultipleLanguages(
        languages, source)
    return tsv
def correctedSequence(text, last):
tmp = text.replace("\"", "\"\"")
find = "\t" in text or "\"" in text or "\n" in text
......@@ -115,16 +155,31 @@ def correctedSequence(text, last):
def getTxt():
txt = []
st.session_state.pdfDir = tempfile.TemporaryDirectory()
st.session_state.tmpDir = tempfile.TemporaryDirectory()
name = st.session_state.file.name
with open(st.session_state.pdfDir.name + "/" + name, "wb") as file:
with open(st.session_state.tmpDir.name + "/" + name, "wb") as file:
file.write(st.session_state.file.getvalue())
with open(st.session_state.pdfDir.name + "/" + name, "r") as file:
with open(st.session_state.tmpDir.name + "/" + name, "r") as file:
for line in file:
txt.append(line)
return txt
def getTxt2(fileAddress):
    """Read a text file and return its lines (line endings preserved).

    The file is decoded as UTF-8 regardless of the platform locale. Bytes
    that are not valid UTF-8 become the replacement character "\ufffd"
    instead of raising ``UnicodeDecodeError``, so one badly encoded file
    does not abort the whole conversion.

    Args:
        fileAddress: path of the TXT file to read.

    Returns:
        A list with one string per line of the file.
    """
    with open(fileAddress, "r", encoding="utf-8", errors="replace") as file:
        return file.readlines()
def getInfo():
    """Return the ``(author, title)`` pair entered by the user.

    When the title field was left empty, the current file name with every
    ".txt" occurrence removed is used as the title instead.
    """
    enteredTitle = st.session_state.title
    title = (
        enteredTitle
        if enteredTitle != ""
        else st.session_state.fileName.replace(".txt", "")
    )
    return st.session_state.author, title
def txtToTSV():
fileName = st.session_state.file.name
......@@ -135,6 +190,32 @@ def txtToTSV():
return tsv
def txtToTSV2(fileName, fileAddress, txtDir):
    """Convert one TXT file to TSV, write it to ``txtDir`` and return its rows.

    Args:
        fileName: name of the TXT file (may contain '/' separated folders).
        fileAddress: path of the TXT file to read.
        txtDir: directory where the ``<name>(txt).tsv`` file is written.

    Returns:
        The generated TSV rows without the header line, so that the caller
        can append them to a compilation that already has a header.
    """
    # NOTE(review): the page is set to 1 here, but the visible caller
    # overwrites it right after this call - confirm this is still needed.
    st.session_state.page = 1
    author, title = getInfo()
    abstract = getTxt2(fileAddress)
    tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
    with st.spinner(st.session_state.general_text_dict['loading']):
        tsv = segmentAbstract2(abstract, tsv, author, title)
    # Keep only the base name: taking component [1] of the split would pick a
    # folder name as soon as the path has more than one folder level.
    fileName = fileName.rsplit('/', 1)[-1]
    with open(txtDir + "/" + fileName.replace(".txt", "(txt).tsv"), "w", encoding="utf-8-sig") as file:
        file.write(tsv)
    # Drop the header line before handing the rows back.
    tsv = "\n".join(tsv.split("\n")[1:])
    return tsv
def extractAllTXT(zipDir, zipFile):
    """Extract every ``.txt`` member of a ZIP archive directly into ``zipDir``.

    The folder structure of the archive is discarded: each member is
    renamed to its last path component before extraction, so all files end
    up at the top level of ``zipDir``. Only members whose name ends with
    ".txt" are extracted; folder entries and names such as "notes.txt.bak"
    are ignored.

    Args:
        zipDir: destination directory.
        zipFile: path or file-like object accepted by ``zipfile.ZipFile``.
    """
    with zipfile.ZipFile(zipFile) as zipRef:
        for info in zipRef.infolist():
            # Flatten the member path; folder entries end with '/' and
            # therefore become an empty name that is skipped below.
            info.filename = info.filename.rsplit('/', 1)[-1]
            if info.filename.endswith(".txt"):
                zipRef.extract(info, zipDir)
# Tool Code End
......@@ -148,20 +229,45 @@ if 'page' not in st.session_state:
if 'warning' not in st.session_state:
st.session_state.warning = ""
if 'submit' not in st.session_state:
st.session_state.submit = False
if 'zipSubmit' not in st.session_state:
st.session_state.zipSubmit = False
if 'txtLanguages' not in st.session_state:
st.session_state.txtLanguages = {}
def setSubmit():
    """Callback: flag that the single-file TXT form has been submitted."""
    st.session_state["submit"] = True
def setPage():
def setZIPSubmit():
st.session_state.zipSubmit = True
def resetPage():
    """Callback: send the page state machine back to its first step (page 0)."""
    st.session_state["page"] = 0
def upPage():
    """Callback: move the page state machine to step 3."""
    st.session_state["page"] = 3
def uploadZip():
    """Render the form used to upload a ZIP archive of TXT files.

    The uploaded file is stored under the ``zipFile`` session key and the
    submit button triggers ``setZIPSubmit``.
    """
    with st.form("Submit2"):
        labels = st.session_state.general_text_dict
        st.write(labels['text2'])
        st.file_uploader(labels['file'], type=["zip"], key='zipFile')
        st.form_submit_button(labels['submit'], on_click=setZIPSubmit)
def askTXT():
with st.form("Submit"):
st.write(st.session_state.general_text_dict['text'])
st.write(st.session_state.general_text_dict['text2'])
st.write(st.session_state.general_text_dict['text3'])
col1, col2 = st.columns(2)
st.session_state.author = ""
......@@ -170,28 +276,58 @@ def askTXT():
st.session_state.general_text_dict['author'], key='author')
with col2:
st.text_input(
st.session_state.general_text_dict['titlePDF'], key='title')
st.session_state.general_text_dict['titleTXT'], key='title')
st.file_uploader(
st.session_state.general_text_dict['file'], type=["txt"], key='file')
st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit())
st.session_state.general_text_dict['submit'], on_click=setSubmit)
def askTXT2(fileName):
    """Render the form asking for the author(s) and title of one TXT file.

    Args:
        fileName: name of the file currently processed, shown above the form
            fields.

    The answers are stored under the ``author`` and ``title`` session keys
    and the submit button triggers ``upPage``.
    """
    with st.form("Submit"):
        st.write(fileName)
        labels = st.session_state.general_text_dict
        st.write(labels['text3'])
        authorColumn, titleColumn = st.columns(2)
        # Clear the author field before the widget bound to it is created.
        st.session_state.author = ""
        with authorColumn:
            st.text_input(labels['author'], key='author')
        with titleColumn:
            st.text_input(labels['titleTXT'], key='title')
        st.form_submit_button(labels['submit'], on_click=upPage)
# Page Code End
if 'submit' not in st.session_state:
st.session_state.submit = False
if st.session_state.page == 0:
if st.session_state.submit:
st.session_state.submit = False
if st.session_state.file != None:
print(st.session_state.file)
st.session_state.page = 1
else:
askTXT()
uploadZip()
elif st.session_state.zipSubmit:
st.session_state.zipSubmit = False
if st.session_state.zipFile != None:
st.session_state.zipDir = tempfile.TemporaryDirectory()
st.session_state.txtDir = tempfile.TemporaryDirectory()
st.session_state.tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract"
extractAllTXT(st.session_state.zipDir.name,
st.session_state.zipFile)
st.session_state.len = len(
os.listdir(st.session_state.zipDir.name))
st.session_state.nbDoc = 0
st.session_state.page = 2
else:
askTXT()
uploadZip()
else:
askTXT()
uploadZip()
if st.session_state.page == 1:
name = st.session_state.file.name
......@@ -201,4 +337,35 @@ if st.session_state.page == 1:
st.session_state.submit = False
if st.session_state.warning != "":
st.info(st.session_state.warning)
st.download_button(name, txt, name, on_click=setPage())
st.download_button(name, txt, name, on_click=resetPage)
if st.session_state.page == 3:
fileName = st.session_state.fileName
tmp = txtToTSV2(
fileName, st.session_state.zipDir.name + '/' + fileName, st.session_state.txtDir.name)
st.session_state.tsv += "\n" + tmp
if st.session_state.nbDoc == st.session_state.len - 1:
st.session_state.page = 4
else:
st.session_state.nbDoc += 1
st.session_state.page = 2
if st.session_state.page == 2:
fileName = os.listdir(st.session_state.zipDir.name)[st.session_state.nbDoc]
st.session_state.fileName = fileName
if '/' in fileName:
fileName = fileName.split('/')[1]
askTXT2(fileName)
if st.session_state.page == 4:
with open(st.session_state.txtDir.name + "/TXTCompilation.tsv", "w", encoding='utf-8-sig') as file:
file.write(st.session_state.tsv)
shutil.make_archive(st.session_state.zipDir.name +
"/TXTCompilation", 'zip', st.session_state.txtDir.name)
with open(st.session_state.zipDir.name + "/TXTCompilation.zip", 'rb') as zip:
if st.session_state.warning != "":
st.info(st.session_state.warning)
detectMultipleTxtLanguages()
st.write(st.session_state.general_text_dict['new_file'])
st.download_button("TXTCompilation.zip",
zip, "TXTCompilation.zip", on_click=resetPage)
......@@ -4,11 +4,6 @@ Nicolas Atrax
"""
import streamlit as st
import pandas as pd
import chardet
import re
import codecs
import os
import tempfile
import shutil
from datetime import date
......@@ -17,6 +12,8 @@ from lib.youtubetranscript.youtube_transcript_api import YouTubeTranscriptApi
from lib.youtubetranscript.youtube_transcript_api._transcripts import NoTranscriptFound
from lib.youtubetranscript.youtube_transcript_api._transcripts import TranscriptsDisabled
import src.basic as tmp
import time
import random
tmp.base("YTBtoTSV")
......@@ -24,7 +21,11 @@ tmp.base("YTBtoTSV")
def ytbSearch(search, n):
videosSearch = VideosSearch(search)
if st.session_state.videoLang == 'fr':
region = 'FR'
else:
region = 'US'
videosSearch = VideosSearch(search, region=region)
result = videosSearch.result()["result"]
videos = []
while len(videos) < n:
......@@ -35,8 +36,11 @@ def ytbSearch(search, n):
videos.append([id, author, title])
if len(videos) == n:
break
if len(videos) == n:
break
tmpResult = result
videosSearch.next()
time.sleep(1.0)
result = videosSearch.result()["result"]
if result == tmpResult:
break
......@@ -44,27 +48,37 @@ def ytbSearch(search, n):
def getLang(list):
tmp = ""
for lang in list:
return str(lang).split(" ")[0]
tmp = str(lang).split(" ")[0]
if tmp == st.session_state.videoLang:
break
return tmp
def translatedTranscript(lang, lst, title, manual):
    """Fetch the transcript in ``lang``, translated to English when needed.

    Args:
        lang: language code of the transcript to look up.
        lst: transcript list exposing ``find_transcript``.
        title: unused.
        manual: unused.

    Returns:
        The fetched transcript; it is translated to "en" first when ``lang``
        is not already "en".
    """
    if lang == "en":
        return lst.find_transcript([lang]).fetch()
    original = lst.find_transcript([lang])
    return original.translate("en").fetch()
def translateTranscript(lst, lang):
    """Translate the ``lang`` transcript into the language chosen by the user.

    Args:
        lst: transcript list exposing ``find_transcript``.
        lang: language code of the transcript to translate from.

    Returns:
        A ``(transcript, manual)`` tuple: the fetched translation into
        ``st.session_state.videoLang`` and a flag that is True when the
        original transcript is not marked as generated.
    """
    source = lst.find_transcript([lang])
    isManual = not source.is_generated
    translated = source.translate(st.session_state.videoLang).fetch()
    return translated, isManual
def ytbTranscript(id, title):
try:
transcriptList = YouTubeTranscriptApi.list_transcripts(id)
lang = getLang(transcriptList)
if lang != st.session_state.videoLang:
return translateTranscript(transcriptList, lang)
try:
transcriptList.find_manually_created_transcript([lang])
return translatedTranscript(lang, transcriptList, title, True), True
transcript = transcriptList.find_manually_created_transcript(
[st.session_state.videoLang]).fetch()
return transcript, True
except NoTranscriptFound:
return translatedTranscript(lang, transcriptList, title, False), False
try:
transcript = transcriptList.find_generated_transcript(
[st.session_state.videoLang]).fetch()
return transcript, False
except NoTranscriptFound:
return None, False
except TranscriptsDisabled:
return None, False
......@@ -133,6 +147,12 @@ def transcriptManualToDoc(transcript, author, title, date):
else:
tmp += text + " "
time += float(part["duration"])
if time >= 20:
tsv = tsvAdd(tsv, tmp, author, title, date, count)
tmp = ""
time = 0
count += 1
with open(st.session_state.zipDir.name + "/" + title + ".tsv", "w", encoding="utf-8-sig") as file:
file.write(tsv)
tsv = "\n".join(tsv.split("\n")[1:])
......@@ -170,15 +190,19 @@ def transcriptToTsv(search, nbVideos):
dict = st.session_state.general_text_dict
with st.spinner(dict['loadingID']):
if st.session_state.manualOnly:
videos = ytbSearch(search, nbVideos * 20)
videos = ytbSearch(search, nbVideos * 15)
else:
videos = ytbSearch(search, nbVideos * 4)
videos = ytbSearch(search, nbVideos * 5)
count = 0
countManual = 0
countTotal = 0
bar = st.progress(count / nbVideos, dict['loading'] +
str(count) + dict['quantity'] + str(nbVideos))
for video in videos:
print(count)
countTotal += 1
waitingTime = random.uniform(2.0, 7.0)
# print("Waiting time : " + str(waitingTime))
time.sleep(waitingTime)
# print(countTotal)
bar.progress(count / nbVideos, dict['loading'] +
str(count) + dict['quantity'] + str(nbVideos))
if count == nbVideos:
......@@ -191,7 +215,6 @@ def transcriptToTsv(search, nbVideos):
continue
transcript = correctTranscript(transcript)
if manual:
countManual += 1
tsv += transcriptManualToDoc(transcript,
author, title, str(date.today().year))
count += 1
......@@ -226,14 +249,17 @@ def resetPage():
def askVideos():
with st.form("Submit"):
st.write(st.session_state.general_text_dict['text'])
st.write(st.session_state.general_text_dict['text2'])
st.selectbox(st.session_state.general_text_dict['videoLang'], ['fr', 'en'],
key='videoLang')
st.text_input(
st.session_state.general_text_dict['keywords'], key='keywords')
st.slider(
st.session_state.general_text_dict['number'], 1, 30, key='nb_taken')
st.session_state.general_text_dict['number'], 1, 20, key='nb_taken')
st.checkbox(
st.session_state.general_text_dict['fill'], key='manualOnly')
st.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=setSubmit())
st.session_state.general_text_dict['submit'], on_click=setSubmit)
# Page Code End
......@@ -258,4 +284,4 @@ if st.session_state.page == 1:
with open(compilName + ".zip", 'rb') as zip:
st.write(st.session_state.general_text_dict['new_file'])
st.download_button(st.session_state.keywords + ".zip",
zip, st.session_state.keywords + ".zip", on_click=resetPage())
zip, st.session_state.keywords + ".zip", on_click=resetPage)
......@@ -226,6 +226,7 @@ if st.session_state.stage == 0:
# Form
form = st.form('api')
lst = ['items', 'collections']
st.session_state.id = form.text_input(
'ID', st.session_state.id, key='idForm', help=st.session_state.general_text_dict['help'])
......
......@@ -6,7 +6,7 @@ from st_pages import show_pages_from_config, add_indentation
def base(page):
st.set_page_config(
page_title="GarganTools " + page,
page_title="GarganTools | " + page,
page_icon="img/isc-pif_logo.png",
)
......@@ -61,10 +61,11 @@ def base(page):
show_pages_from_config()
elif st.session_state.general_session_page != page:
st.session_state.general_text_dict = load_bundle(st.session_state.general_language)
st.session_state.general_text_dict = load_bundle(
st.session_state.general_language)
st.session_state.general_session_page = page
show_pages_from_config()
# Delete every key that isn't from this file
for key in st.session_state.keys():
if 'general_' not in key:
......@@ -72,14 +73,12 @@ def base(page):
add_indentation()
# select the lang
coltitle,col = st.columns([4,1])
coltitle, col = st.columns([4, 1])
with coltitle:
st.write(st.session_state.general_text_dict['title'])
with col:
_,col1, col2 = st.columns([1,1,1])
_, col1, col2 = st.columns([1, 1, 1])
with col1:
st.button(':fr:', on_click=update_lang, args=('fr',))
with col2:
st.button(':us:', on_click=update_lang, args=('en',))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment