Update YTBtoTSV.py

a396f597 · Nicolas Atrax · 281865d7 · a396f597
Commit a396f597 authored Aug 31, 2023 by Nicolas Atrax
Hide whitespace changes
Inline Side-by-side

Showing with 16 additions and 11 deletions

YTBtoTSV.py Conversion/ToTSV/YTBtoTSV/YTBtoTSV.py +16 -11

No files found.
--- a/Conversion/ToTSV/YTBtoTSV/YTBtoTSV.py
+++ b/Conversion/ToTSV/YTBtoTSV/YTBtoTSV.py
@@ -13,11 +13,16 @@ def ytbSearch(search, n):
    videosSearch = VideosSearch(search, limit=n)
    result = videosSearch.result()["result"]
    videos = []
-    for video in result:
+    while len(videos) < n:
-        id = video["id"]
+        for video in result:
-        title = video["title"]
+            id = video["id"]
-        author = video["channel"]["name"]
+            title = video["title"]
-        videos.append([id, author, title])
+            author = video["channel"]["name"]
+            videos.append([id, author, title])
+            if len(videos) == n:
+                break
+        videosSearch.next()  # page the VideosSearch object; `search` is only the query string
+        result = videosSearch.result()["result"]  # results of the page just fetched
    return videos
@@ -27,7 +32,7 @@ def getLang(list):
 def translatedTranscript(lang, lst, title, manual):
-    if lang != "fr" and lang != "en":
+    if lang != "en":
        res = lst.find_transcript([lang])
        trans = res.translate("en").fetch()
        print("Subtitles are translated from " +
@@ -59,8 +64,8 @@ def ytbTranscript(id, title):
 def tsvAdd(tsv, abstract, author, title, date, count):
    part = title + " : Part " + str(count)
    tsv += correctedSequence(author, False) + "\t" + correctedSequence(
-        part, False) + "\t" + date + "\t" + "1" + "\t" + "1" + "\t"
+        title, False) + "\t" + date + "\t" + "1" + "\t" + "1" + "\t"  # 2nd field is now the bare video title
-    tsv += correctedSequence(title, False) + "\t"
+    tsv += correctedSequence(part, False) + "\t"  # 6th field is now the title with the " : Part N" suffix
    tsv += correctedSequence(abstract, True)
    return tsv
@@ -71,7 +76,7 @@ def transcriptAutomaticToDoc(transcript, author, title, date):
    count = 1
    tmp = ""
    for part in transcript:
-        tmp += part["text"] + " "
+        tmp += part["text"] + "\n"
        time += int(part["duration"])
        if time >= 20:
            tsv = tsvAdd(tsv, tmp, author, title, date, count)
@@ -117,7 +122,7 @@ def transcriptManualToDoc(transcript, author, title, date):
                tmp += text
                time += int(part["duration"])
        else:
-            tmp += text + " "
+            tmp += text + "\n"
            time += int(part["duration"])
    with open("tsv/" + title + ".tsv", "w", encoding='utf-8-sig') as file:
        file.write(tsv)
@@ -139,7 +144,7 @@ def correctedSequence(text, last):
 def transcriptToTsv(search, nbVideos):
    tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
-    videos = ytbSearch(search, nbVideos)
+    videos = ytbSearch(search, nbVideos * 4)
    for video in videos:
        id, author, title = video[0], video[1], video[2]
        print("Processing conversion for video : " + title)