Commit a396f597 authored by Nicolas Atrax

Update YTBtoTSV.py

parent 281865d7
...@@ -13,11 +13,16 @@ def ytbSearch(search, n): ...@@ -13,11 +13,16 @@ def ytbSearch(search, n):
videosSearch = VideosSearch(search, limit=n) videosSearch = VideosSearch(search, limit=n)
result = videosSearch.result()["result"] result = videosSearch.result()["result"]
videos = [] videos = []
for video in result: while len(videos) < n:
id = video["id"] for video in result:
title = video["title"] id = video["id"]
author = video["channel"]["name"] title = video["title"]
videos.append([id, author, title]) author = video["channel"]["name"]
videos.append([id, author, title])
if len(videos) == n:
break
search.next()
result = search.result()["result"]
return videos return videos
...@@ -27,7 +32,7 @@ def getLang(list): ...@@ -27,7 +32,7 @@ def getLang(list):
def translatedTranscript(lang, lst, title, manual): def translatedTranscript(lang, lst, title, manual):
if lang != "fr" and lang != "en": if lang != "en":
res = lst.find_transcript([lang]) res = lst.find_transcript([lang])
trans = res.translate("en").fetch() trans = res.translate("en").fetch()
print("Subtitles are translated from " + print("Subtitles are translated from " +
...@@ -59,8 +64,8 @@ def ytbTranscript(id, title): ...@@ -59,8 +64,8 @@ def ytbTranscript(id, title):
def tsvAdd(tsv, abstract, author, title, date, count): def tsvAdd(tsv, abstract, author, title, date, count):
part = title + " : Part " + str(count) part = title + " : Part " + str(count)
tsv += correctedSequence(author, False) + "\t" + correctedSequence( tsv += correctedSequence(author, False) + "\t" + correctedSequence(
part, False) + "\t" + date + "\t" + "1" + "\t" + "1" + "\t" title, False) + "\t" + date + "\t" + "1" + "\t" + "1" + "\t"
tsv += correctedSequence(title, False) + "\t" tsv += correctedSequence(part, False) + "\t"
tsv += correctedSequence(abstract, True) tsv += correctedSequence(abstract, True)
return tsv return tsv
...@@ -71,7 +76,7 @@ def transcriptAutomaticToDoc(transcript, author, title, date): ...@@ -71,7 +76,7 @@ def transcriptAutomaticToDoc(transcript, author, title, date):
count = 1 count = 1
tmp = "" tmp = ""
for part in transcript: for part in transcript:
tmp += part["text"] + " " tmp += part["text"] + "\n"
time += int(part["duration"]) time += int(part["duration"])
if time >= 20: if time >= 20:
tsv = tsvAdd(tsv, tmp, author, title, date, count) tsv = tsvAdd(tsv, tmp, author, title, date, count)
...@@ -117,7 +122,7 @@ def transcriptManualToDoc(transcript, author, title, date): ...@@ -117,7 +122,7 @@ def transcriptManualToDoc(transcript, author, title, date):
tmp += text tmp += text
time += int(part["duration"]) time += int(part["duration"])
else: else:
tmp += text + " " tmp += text + "\n"
time += int(part["duration"]) time += int(part["duration"])
with open("tsv/" + title + ".tsv", "w", encoding='utf-8-sig') as file: with open("tsv/" + title + ".tsv", "w", encoding='utf-8-sig') as file:
file.write(tsv) file.write(tsv)
...@@ -139,7 +144,7 @@ def correctedSequence(text, last): ...@@ -139,7 +144,7 @@ def correctedSequence(text, last):
def transcriptToTsv(search, nbVideos): def transcriptToTsv(search, nbVideos):
tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n" tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
videos = ytbSearch(search, nbVideos) videos = ytbSearch(search, nbVideos * 4)
for video in videos: for video in videos:
id, author, title = video[0], video[1], video[2] id, author, title = video[0], video[1], video[2]
print("Processing conversion for video : " + title) print("Processing conversion for video : " + title)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment