I am training a deep learning model on a dataset of 100GB of video data. I am trying to convert it all into frames before applying Haar Cascade on each frame to crop the faces out. But im trying to find the fastest way to do so.
NOTE: There are 5 folds each with 2 parts in them. Each part has 6 folders with 3 different categories of data(0, 5, 10) in different video formats(mp4, MOV, mov). There are 144 video files in total and each is about 700MB in size.
EDIT: I am running into an issue where the processing of the video to frames is not waiting for its work to be done before going on to the next file
Error Message: It immediately throws an error for each file instead of waiting to convert the frames of the current file then moving on to the next file.
Error!!!
Reading from /Volumes/HDD/Data/Fold4_part2/44/5.mov
Category:5
Writing to /Volumes/HDD/Data/Fold4_part2/44
Number of frames:  7353
Converting video..
Error!!!
Reading from /Volumes/HDD/Data/Fold4_part2/45/0.mp4
Category:0
Writing to /Volumes/HDD/Data/Fold4_part2/45
Number of frames:  7716
Converting video..
Code:
import cv2
import time 
import os
path_HDD = "/Volumes/HDD/Data"
def files(path):
    """
    Function to get the files and add them to a list 
    Args: 
        path: path of the file 
    Not sure what is DS_Store but I do not need it
    """
    for root, directories, files in os.walk(path, topdown=False):
        for name in files:
            file_path = os.path.join(root, name)
            if (name == ".DS_Store"):
                continue 
            else: 
                category = name.split(".")[0]
                # Category returns the video category 
                try: 
                    print("Reading from " + file_path)
                    print("Category:" + category)
                    print("Writing to " + root)
                    video_to_frames(file_path, category, root)
                except:
                    print("Error!!!")
 
        
def video_to_frames(input_loc, label,output_loc):
    """Function to extract frames from input video file
    and save them as separate frames in an output directory.
    Args:
        input_loc: Input video file.
        output_loc: Output directory to save the frames.
    Returns:
        None
    """
    # Log the time
    time_start = time.time()
    # Start capturing the feed
    cap = cv2.VideoCapture(input_loc)
    # Find the number of frames
    video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - 1
    print ("Number of frames: ", video_length)
    count = 0
    print ("Converting video..\n")
    # Start converting the video
    while cap.isOpened():
        # Extract the frame
        ret, frame = cap.read()
        # Write the results back to output location.
        cv2.imwrite(output_loc +  "/" + label + "/%#05d.jpg" % (count+1), frame)
        
        count = count + 1
        # If there are no more frames left
        if (count > (video_length-1)):
            # Log the time again
            time_end = time.time()
            # Release the feed
            cap.release()
            # Print stats
            print ("Done extracting frames.\n%d frames extracted" % count)
            print ("It took %d seconds forconversion." % (time_end-time_start))
            break
if __name__=="__main__":
    files(path_HDD)
 
    