# Resume flag: True when args.continue_from was supplied (is not None). The loop
# below uses it to skip urls until the one equal to args.continue_from shows up.
# `is not None` is the identity test PEP 8 prescribes for None comparisons.
continuing = args.continue_from is not None
# Per-url processing loop (pre-change snapshot): runs at most args.size iterations,
# pulling one entry per iteration from url_fetcher.
# NOTE(review): indentation is missing in this copy, so the nesting described in the
# comments below is read from the syntax, not from columns.
for i in range(args.size):
# Take element 0 of whatever url_fetcher.next() returns.
# NOTE(review): the indexing happens before the `not url` check further down, so if
# next() can return None or an empty sequence (TODO confirm) this line raises before
# the warning is ever reached.
url = url_fetcher.next()[0]
# Resume support: while `continuing` is set, skip every url that differs from
# args.continue_from; on the first match clear the flag and process that url.
if continuing:
if url != args.continue_from:
continue
else:
continuing = False
# Warn when the fetched url is falsy.
# NOTE(review): no break/continue is visible after the warning in this copy, so the
# iteration appears to carry on with the falsy url — confirm against the real file.
if not url:
cp.print_warning("there are no more urls to process")
cp.print_progress(i + 1, " / ", args.size, " - ", url)
# Build the PyTube object for this url; on any exception report it and go to the next url.
try:
video = PyTube(utils.get_youtube_url(url))
except Exception as exception:
cp.print_error("failed to generate PyTube representation for vidoe ", url)
continue
# Skip videos whose length exceeds args.video_length.
if int(video.length) > args.video_length:
continue
# Require a caption track for language code "en"; skip the video when there is none.
caption = video.captions.get_by_language_code("en")
if not caption:
cp.print_warning("no caption available for video - ", url)
continue
# Generate the SRT text and split it on blank lines (presumably one chunk per SRT entry).
try:
srt_captions = caption.generate_srt_captions().split("\n\n")
except Exception as exception:
cp.print_error("failed to retrieve for vidoe - ", url)
continue
# Translation table that deletes every character in string.punctuation.
# NOTE(review): the trailing "// to remove punctuation" is not Python comment syntax
# (Python uses "#"); as written the next line does not parse — presumably an artifact
# of how this snippet was captured.
# NOTE(review): translator and srt_tag_re do not depend on the url, yet are rebuilt on
# every iteration; neither is used in the visible part of the loop.
translator = str.maketrans("", "", string.punctuation) // to remove punctuation
# Pattern matching <...>, (...) and [...] spans, each non-greedy.
srt_tag_re = re.compile(r"<.*?>|\(.*?\)|\[.*?\]")
# Pre-check before fetching audio: look for `keyword`, or plural.plural(keyword), as a
# substring of any caption chunk, stopping at the first hit.
keyword_exist = False
for captions in srt_captions:
if keyword in captions or plural.plural(keyword) in captions:
keyword_exist = True
break
if not keyword_exist:
cp.print_warning("keywords never appear in the video - ", url)
continue
# Fetch the audio only for videos that passed the keyword check; any exception is
# printed as a warning and the video is skipped.
try:
crawler = YoutubeCrawler(url)
audio_data = crawler.get_audio()
except Exception as exception:
cp.print_warning(exception)
continue
# Per-video accumulators; their later use is outside the visible part of this snippet.
collected_data = []
video_cc_count = 0
video_audio_count = 0
# Second pass over the caption chunks, splitting each chunk into its lines.
for captions in srt_captions:
cc_split = captions.split("\n")
# A 4-line chunk whose first line is empty is reduced to its last three lines.
if len(cc_split) == 4 and cc_split[0] == "":
cc_split = (cc_split[1], cc_split[2], cc_split[3])
# Chunks that do not have exactly 3 lines take this branch; its body is cut off here.
elif len(cc_split) != 3:
After Change
# Urls already handled in this run; used below to skip duplicates from url_fetcher.
url_set = set()
# Per-url processing loop (post-change snapshot): runs at most args.size iterations,
# pulling one entry per iteration from url_fetcher.
# NOTE(review): indentation is missing in this copy, so the nesting described in the
# comments below is read from the syntax, not from columns.
for i in range(args.size):
# Fetch the next entry and leave the loop as soon as the fetcher returns a falsy value.
url = url_fetcher.next()
if not url:
cp.print_warning("there are no more urls to process")
break
# The url is element 0 of the fetched entry.
url = url[0]
# Resume support: while `continuing` is set, skip every url that differs from
# args.continue_from; on the first match clear the flag and process that url.
if continuing:
if url != args.continue_from:
continue
else:
continuing = False
cp.print_progress(i + 1, " / ", args.size, " - ", url)
# Skip urls that were already seen in this run; otherwise remember this one.
if url in url_set:
cp.print_warning("video is already processed", url)
continue
url_set.add(url)
# NOTE(review): this repeats the resume check above. By this point `continuing` has
# either been cleared or the iteration was already skipped, so this block looks like
# a no-op left over from the change — confirm and remove.
if continuing:
if url != args.continue_from:
continue
else:
continuing = False
# Build the PyTube object for this url; on any exception report it and go to the next url.
try:
video = PyTube(utils.get_youtube_url(url))
except Exception as exception:
cp.print_error("failed to generate PyTube representation for vidoe ", url)
continue
# Skip videos whose length exceeds args.video_length.
if int(video.length) > args.video_length:
continue
# Require a caption track for language code "en"; skip the video when there is none.
caption = video.captions.get_by_language_code("en")
if not caption:
cp.print_warning("no caption available for video - ", url)
continue
# Generate the SRT text and split it on blank lines (presumably one chunk per SRT entry).
try:
srt_captions = caption.generate_srt_captions().split("\n\n")
except Exception as exception:
cp.print_error("failed to retrieve for vidoe - ", url)
continue
# Translation table that deletes every character in string.punctuation.
# NOTE(review): the trailing "// to remove punctuation" is not Python comment syntax
# (Python uses "#"); as written the next line does not parse — presumably an artifact
# of how this snippet was captured.
# NOTE(review): translator and srt_tag_re do not depend on the url, yet are rebuilt on
# every iteration; neither is used in the visible part of the loop.
translator = str.maketrans("", "", string.punctuation) // to remove punctuation
# Pattern matching <...>, (...) and [...] spans, each non-greedy.
srt_tag_re = re.compile(r"<.*?>|\(.*?\)|\[.*?\]")
# Pre-check before fetching audio: look for `keyword`, or plural.plural(keyword), as a
# substring of any caption chunk, stopping at the first hit.
keyword_exist = False
for captions in srt_captions:
if keyword in captions or plural.plural(keyword) in captions:
keyword_exist = True
break
if not keyword_exist:
cp.print_warning("keywords never appear in the video - ", url)
continue
# Fetch the audio only for videos that passed the keyword check; any exception is
# printed as a warning and the video is skipped.
try:
crawler = YoutubeCrawler(url)
audio_data = crawler.get_audio()
except Exception as exception:
cp.print_warning(exception)
continue
# Per-video accumulators; their later use is outside the visible part of this snippet.
collected_data = []
video_cc_count = 0
video_audio_count = 0
# Second pass over the caption chunks, splitting each chunk into its lines.
for captions in srt_captions:
cc_split = captions.split("\n")
# A 4-line chunk whose first line is empty is reduced to its last three lines.
if len(cc_split) == 4 and cc_split[0] == "":
cc_split = (cc_split[1], cc_split[2], cc_split[3])
# Chunks that do not have exactly 3 lines take this branch; its body is cut off here.
elif len(cc_split) != 3: