"""
Streamlit Application
Nicolas Atrax

PDFtoTSV: the user uploads a zip archive of PDF files, fills in a small
form (author, title, watermark) for each PDF, and downloads a zip archive
holding one TSV file per PDF plus a compilation TSV.  The text of each PDF
is cut into overlapping segments of lines; each segment becomes one TSV row.
"""
import shutil
import zipfile
import tempfile
import os
from datetime import date
import re
import pandas as pd
import streamlit as st
import lib.tika.tika as tika
tika.initVM()
from lib.tika.tika import parser
from lib.langdetect.langdetect import detect
from lib.langdetect.langdetect.lang_detect_exception import LangDetectException
import src.basic as tmp

os.environ['TIKA_SERVER_JAR'] = 'https://repo1.maven.org/maven2/org/apache/tika/tika-server/1.19/tika-server-1.19.jar'

tmp.base("PDFtoTSV")

# Header row shared by every per-PDF TSV and by the compilation TSV.
TSV_HEADER = "Authors\tSource\tPublication Year\tpublication Month\tPublication_day\tTitle\tAbstract\n"
# A segment holds up to 9 lines and consecutive segments start 7 lines
# apart, so neighbouring segments overlap by 2 lines.
SEGMENT_LINES = 9
SEGMENT_STRIDE = 7
# Minimum share (in percent) for a language to be reported in a warning.
LANGUAGE_SHARE_THRESHOLD = 15


# Tool Code Start
def replaceNewlines(txt):
    """Return *txt* without the lines that are empty or made only of spaces."""
    kept = [line for line in txt.split("\n") if line.replace(" ", "") != ""]
    return "\n".join(kept)


def getTextFromPDF(fileAddress):
    """Extract the text of the PDF at *fileAddress*, blank lines removed."""
    parsed_pdf = parser.from_file(fileAddress)
    # NOTE(review): presumably the parser can yield None as content when no
    # text is extracted (e.g. scanned PDF) - treat that as empty text rather
    # than crashing on None.split().
    data = parsed_pdf['content'] or ""
    return replaceNewlines(data)


def detectLanguages(abstract, languages):
    """Count the language detected for *abstract* in the *languages* dict.

    *languages* maps a language code to the number of segments detected in
    that language.  It is updated in place and also returned.  Text without
    any ASCII letter is skipped, and a detection failure is ignored.
    """
    if re.search('[a-zA-Z]', abstract) is None:
        return languages
    try:
        code = detect(abstract)
    except LangDetectException:
        # Best effort: a segment whose language cannot be detected is
        # simply not counted.
        return languages
    languages[code] = languages.get(code, 0) + 1
    return languages


def estimateLanguagesPercentage(languages):
    """Convert language counts to percentages.

    Returns a ``(percentages, principal)`` pair: the percentages are rounded
    to one decimal, and *principal* is the language with the highest share
    (the first one on ties, ``""`` when *languages* is empty).
    """
    total = sum(int(count) for count in languages.values())
    highest = 0
    principal = ""
    percentages = {}
    for code, count in languages.items():
        share = round(int(count) / total * 100, 1)
        if share > highest:
            highest = share
            principal = code
        percentages[code] = share
    return percentages, principal


def detectMultipleLanguages(languages, fileName):
    """Return the principal language of a PDF, warning if it mixes several.

    When more than one language reaches LANGUAGE_SHARE_THRESHOLD percent,
    a warning naming *fileName* and the shares is appended to
    ``st.session_state.warning``.
    """
    percentages, principal = estimateLanguagesPercentage(languages)
    valuable = [code for code in percentages
                if percentages[code] >= LANGUAGE_SHARE_THRESHOLD]
    if len(valuable) > 1:
        detected = "| ".join(
            code + " : " + str(percentages[code]) + "%" for code in valuable)
        texts = st.session_state.general_text_dict
        st.session_state.warning += texts['warning'] + "\"" + \
            fileName + "\" !\n"
        st.session_state.warning += texts['warning2'] + detected + " \n"
        st.session_state.warning += texts['advice'] + "\n\n"
    return principal


def detectMultiplePdfLanguages():
    """Show an info notice when the processed PDFs differ in principal language."""
    distinct = set(st.session_state.pdfLanguages.values())
    if len(distinct) > 1:
        st.info(st.session_state.general_text_dict['globalWarning'])
        st.info(str(st.session_state.pdfLanguages))
        st.info(st.session_state.general_text_dict['advice'])


def removeWatermark(abstract, watermark):
    """Split *abstract* into lines with every *watermark* occurrence removed.

    The lines are concatenated before the removal, so a watermark that is
    broken across a line boundary is removed as well.
    """
    lines = abstract.split("\n")
    if watermark == "":
        return lines
    cleaned = "".join(lines).replace(watermark, "")
    return reSplit(cleaned, lines)


def reSplit(new, old):
    """Re-apply the line layout of *old* to the text *new*.

    *new* is expected to be the concatenation of the lines of *old* with
    some characters removed.  Each line keeps, in order, the characters
    that match the next unconsumed character of *new*; lines left empty
    are dropped.
    """
    res = []
    count = 0
    total = len(new)
    for line in old:
        kept = []
        for c in line:
            # Stop matching once *new* is consumed: indexing past its end
            # used to raise IndexError when trailing text was removed.
            if count < total and c == new[count]:
                kept.append(c)
                count += 1
        if kept:
            res.append("".join(kept))
    return res


def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month,
                    day, watermark):
    """Append one TSV row per text segment of a PDF to *tsv* and return it.

    The PDF text is cut into segments of up to SEGMENT_LINES lines starting
    SEGMENT_STRIDE lines apart; the last segment is aligned on the end of
    the text.  Each row holds author, source, year, month, day, a title of
    the form ``"<source> : Part <n>"`` and the segment text.  The principal
    language of the PDF is stored in ``st.session_state.pdfLanguages``
    under *fileName*.
    """
    text = getTextFromPDF(fileAddress)
    # Drop empty lines so that a PDF without text produces no row.
    lines = [line for line in removeWatermark(text, watermark) if line != ""]
    nbLines = len(lines)

    rows = []
    languages = {}
    start = 0
    part = 1
    while nbLines > 0:
        # NOTE(review): as visible in the source, the second replace() has
        # an empty pattern and is a no-op; it presumably targeted a
        # non-printable character - confirm against the original file.
        doc = "\n".join(lines[start:start + SEGMENT_LINES]).replace(
            "�", "").replace("", "")
        title = source + " : Part " + str(part)
        fields = [
            correctedSequence(author, False),
            correctedSequence(source, False),
            year,
            month,
            day,
            correctedSequence(title, False),
        ]
        # correctedSequence(..., True) terminates the row with "\n".
        rows.append("\t".join(fields) + "\t" + correctedSequence(doc, True))
        languages = detectLanguages(doc, languages)

        if start + SEGMENT_LINES >= nbLines:
            break  # this segment reached the end of the text
        # Never start past the point where a full final segment fits; a
        # text shorter than one segment is handled by the break above, so
        # the start index can no longer become negative.
        start = min(start + SEGMENT_STRIDE, nbLines - SEGMENT_LINES)
        part += 1

    st.session_state.pdfLanguages[fileName] = detectMultipleLanguages(
        languages, fileName)
    return tsv + "".join(rows)


def correctedSequence(text, last):
    """Escape *text* as a TSV field.

    Double quotes are doubled, and the field is wrapped in double quotes
    when it contains a tab, a double quote or a newline (a trailing newline
    is dropped first).  When *last* is true the row terminator ``"\\n"`` is
    appended.
    """
    escaped = text.replace("\"", "\"\"")
    if "\t" in text or "\"" in text or "\n" in text:
        if text.endswith("\n"):
            escaped = escaped[:-1]
        escaped = "\"" + escaped + "\""
    if last:
        escaped += "\n"
    return escaped


def getInfo():
    """Return the (author, title, watermark) entered for the current PDF.

    An empty title falls back to the current file name without ``.pdf``.
    """
    title = st.session_state.title
    if title == "":
        title = st.session_state.fileName.replace(".pdf", "")
    return st.session_state.author, title, st.session_state.watermark


def pdfToTSV(fileName, fileAddress, pdfDir):
    """Convert one PDF to a TSV file in *pdfDir* and return its data rows.

    The file is written with the header row; the returned string holds the
    same content without the header.  The publication date is set to
    January 1st of the current year.
    """
    st.session_state.page = 1
    author, title, watermark = getInfo()
    with st.spinner(st.session_state.general_text_dict['loading']):
        tsv = segmentAbstract(fileName, fileAddress, TSV_HEADER, author, title,
                              str(date.today().year), "1", "1", watermark)
    # Keep only the last path component of the name.
    fileName = fileName.split('/')[-1]
    # Explicit UTF-8: the platform default encoding may be unable to encode
    # the extracted text.
    with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w",
              encoding="utf-8") as file:
        file.write(tsv)
    # Everything after the first newline, i.e. the rows without the header.
    return tsv.partition("\n")[2]


def extractAllPDF(zipDir, zipFile):
    """Extract every ``.pdf`` member of *zipFile* directly into *zipDir*.

    Directory components of the member names are discarded, so all PDFs
    end up side by side in *zipDir*.
    """
    with zipfile.ZipFile(zipFile) as zipRef:
        for info in zipRef.infolist():
            # Flatten the archive: keep only the last path component.
            info.filename = info.filename.split('/')[-1]
            # Suffix test rather than substring test, so that names such as
            # "notes.pdf.txt" are not mistaken for PDFs.
            if info.filename.endswith(".pdf"):
                zipRef.extract(info, zipDir)


def _listPdfFiles(zipDir):
    """Return the extracted file names in a stable (sorted) order.

    os.listdir() returns names in arbitrary order, while the page flow
    addresses files by index across reruns.
    """
    return sorted(os.listdir(zipDir))
# Tool Code End


# Kept from the original script; this form object is not used below.
form = st.form('api')

# Page Code Start
# Session defaults.  page: 0 = upload, 1 = ask metadata for the current PDF,
# 2 = convert the current PDF, 3 = offer the download.
for _key, _default in (('page', 0), ('submit', False), ('warning', ""),
                       ('pdfLanguages', {})):
    if _key not in st.session_state:
        st.session_state[_key] = _default


def setSubmit():
    """Callback: record that the upload form was submitted."""
    st.session_state.submit = True


def resetPage():
    """Callback: go back to the upload page."""
    st.session_state.page = 0


def upPage():
    """Callback: move on to the conversion of the current PDF."""
    st.session_state.page = 2


def uploadZip():
    """Render the form used to upload the zip archive of PDFs."""
    texts = st.session_state.general_text_dict
    with st.form("Submit"):
        st.write(texts['text'])
        st.write(texts['text2'])
        st.file_uploader(texts['file'], type=["zip"], key='file')
        st.form_submit_button(texts['submit'], on_click=setSubmit)


def askPDF(fileName):
    """Render the form asking author, title and watermark for *fileName*."""
    texts = st.session_state.general_text_dict
    with st.form("Submit"):
        st.write(fileName)
        st.write(texts['text3'])
        col1, col2 = st.columns(2)
        # Start every PDF with empty fields.
        st.session_state.author = ""
        st.session_state.title = ""
        st.session_state.watermark = ""
        with col1:
            st.text_input(texts['author'], key='author')
        with col2:
            st.text_input(texts['titlePDF'], key='title')
        st.text_input(texts['watermark'], key='watermark')
        st.form_submit_button(texts['submit'], on_click=upPage)
# Page Code End


if st.session_state.page == 0:
    submitted = st.session_state.submit
    st.session_state.submit = False
    if submitted and st.session_state.file is not None:
        st.session_state.zipDir = tempfile.TemporaryDirectory()
        st.session_state.pdfDir = tempfile.TemporaryDirectory()
        st.session_state.tsv = TSV_HEADER
        # Start from a clean slate so that warnings and languages of a
        # previous archive do not leak into this one.
        st.session_state.warning = ""
        st.session_state.pdfLanguages = {}
        extractAllPDF(st.session_state.zipDir.name, st.session_state.file)
        st.session_state.len = len(
            _listPdfFiles(st.session_state.zipDir.name))
        st.session_state.nbDoc = 0
        if st.session_state.len > 0:
            st.session_state.page = 1
        else:
            # Nothing to process: stay on the upload page instead of
            # failing on an empty file list.
            st.warning("No PDF file found in the uploaded archive.")
            uploadZip()
    else:
        uploadZip()

if st.session_state.page == 2:
    fileName = st.session_state.fileName
    # Named "rows" rather than "tmp" so the src.basic module alias is not
    # overwritten.
    rows = pdfToTSV(
        fileName,
        st.session_state.zipDir.name + '/' + fileName,
        st.session_state.pdfDir.name)
    st.session_state.tsv += "\n" + rows
    if st.session_state.nbDoc == st.session_state.len - 1:
        st.session_state.page = 3
    else:
        st.session_state.nbDoc += 1
        st.session_state.page = 1

if st.session_state.page == 1:
    fileName = _listPdfFiles(
        st.session_state.zipDir.name)[st.session_state.nbDoc]
    st.session_state.fileName = fileName
    askPDF(fileName.split('/')[-1])

if st.session_state.page == 3:
    with open(st.session_state.pdfDir.name + "/PDFCompilation.tsv", "w",
              encoding="utf-8") as file:
        file.write(st.session_state.tsv)
    shutil.make_archive(st.session_state.zipDir.name + "/PDFCompilation",
                        'zip', st.session_state.pdfDir.name)
    with open(st.session_state.zipDir.name + "/PDFCompilation.zip",
              'rb') as archive:
        if st.session_state.warning != "":
            st.info(st.session_state.warning)
        detectMultiplePdfLanguages()
        st.write(st.session_state.general_text_dict['new_file'])
        st.download_button("PDFCompilation.zip", archive,
                           "PDFCompilation.zip", on_click=resetPage)