python script to extract chapters

Question

python script to extract chapters

ex-nerd opened this issue 3 years ago · comments

Do with it as you wish. I still haven't updated this for python3, but it's really handy combined with https://github.com/ex-nerd/audiotools (wow, I feel old looking at the last-updated dates on those) to merge files into m4b files that can load into a dedicated audiobook app on a phone.

#!/usr/bin/env python2
#
# Use with build_m4b from https://github.com/ex-nerd/audiotools
# Due to overdrive low quality, there is no point in encoding aac files
# with better than: 64kbps stereo, HE, optimize for voice
#

import os, sys, re
import mutagen.id3 as id3
from mutagen.mp3 import MP3
from mutagen import File

from collections import OrderedDict

def timestr(secs):
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    The value is converted to a rounded integer millisecond count and split
    with divmod. Slicing the decimal digits out of ``str(secs)`` instead is
    fragile: 1.5 came out as ``.005``, a fraction such as ``.9996`` rounded
    up into a four-digit ``.1000`` field, and any value whose ``str()`` has
    no ``.`` (an int, or a float printed in exponent notation) raised
    ValueError.

    :param secs: non-negative duration in seconds (int or float).
    :return: zero-padded timestamp string, e.g. ``'01:01:01.250'``.
    """
    # int() is required because round() returns a float on Python 2.
    total_ms = int(round(float(secs) * 1000))
    whole_secs, ms = divmod(total_ms, 1000)
    hours, remainder = divmod(whole_secs, 3600)
    mins, whole_secs = divmod(remainder, 60)
    return '{0:02}:{1:02}:{2:02}.{3:03}'.format(hours, mins, whole_secs, ms)

def load_mp3(total, dir, file):
    """Read the OverDrive chapter markers stored in one mp3 file.

    :param total: running offset in seconds (the summed length of the files
        processed so far); every marker time is shifted by it.
    :param dir: directory containing the file.
    :param file: name of the mp3 file inside ``dir``.
    :return: ``(new_total, chapters)``. ``new_total`` is ``total`` plus this
        file's length. ``chapters`` is a list of ``[name, seconds]`` pairs and
        is empty when the file has no ``TXXX:OverDrive MediaMarkers`` tag.
    """
    path = os.path.join(dir, file)
    audio = MP3(path)
    print(audio.info.length) #, audio.info.bitrate
    # Computed up front so every exit path reports the same advanced offset.
    new_total = total + audio.info.length
    m = id3.ID3(path)

    data = m.get('TXXX:OverDrive MediaMarkers')
    if not data:
        print("Can't find TXXX data point for {0}".format(file))
        print(m.keys())
        # The result is documented as a (total, chapters) pair. A bare return
        # (None) breaks tuple unpacking in the caller and loses this file's
        # duration, which would shift every later chapter timestamp.
        return (new_total, [])

    # Non-ASCII characters are dropped from the marker text.
    # NOTE: on Python 3 this yields bytes, which the str pattern below cannot
    # search; this script targets Python 2 (see the shebang).
    info = data.text[0].encode("ascii", "ignore")
    file_chapters = re.findall(r"<Name>\s*([^>]+?)\s*</Name><Time>\s*([\d:.]+)\s*</Time>", info, re.MULTILINE)

    chapters = []
    for (name, length) in file_chapters:
        name = re.sub(r'^"(.+)"$', r'\1', name)          # strip surrounding double quotes
        name = re.sub(r'^\*(.+)\*$', r'\1', name)        # strip surrounding asterisks
        name = re.sub(r'\s*\([^)]*\)$', '', name)        # ignore any sub-chapter markers from Overdrive
        name = re.sub(r'\s+\(?continued\)?$', '', name)  # ignore any sub-chapter markers from Overdrive
        name = re.sub(r'\s+-\s*$', '', name)             # ignore any sub-chapter markers from Overdrive
        name = re.sub(r'^Dis[kc]\s+\d+\W*$', '', name)   # ignore any disk markers from Overdrive
        name = name.strip()

        # The time is written as [[H:]M:]S with an optional fraction on the
        # seconds; reverse the fields so index 0 is always the seconds.
        t_parts = list(length.split(':'))
        t_parts.reverse()
        seconds = total + float(t_parts[0])
        if len(t_parts) > 1:
            seconds += (int(t_parts[1]) * 60)
        if len(t_parts) > 2:
            seconds += (int(t_parts[2]) * 60 * 60)
        chapters.append([name, seconds])
        print(name, seconds)
    return (new_total, chapters)


    # try:
    #     if file.decode("utf-8") == new.decode("utf-8"):
    #         new = None
    # except:
    #     print "  FILE:  "+os.path.join(dirname, file)
    #     raise
    # # Return
    # return (m, new, changed)

def visit(arg, dirname, names):
    """Collect the chapter markers of one directory and write its chapter list.

    Called once per directory by the directory walk at the bottom of the
    script. Every ``*.mp3`` entry in ``names`` is read in sorted order; when
    at least one chapter is found, ``overdrive_chapters.txt`` is written into
    the directory with one ``<timestamp> <name>`` line per chapter.

    :param arg: unused; present only to satisfy the callback signature.
    :param dirname: directory being visited. The process working directory is
        changed to it.
    :param names: names of the entries inside ``dirname``.
    """
    print(dirname)
    # The output file below is opened relative to the working directory.
    os.chdir(dirname)

    total = 0
    all_chapters = OrderedDict()
    for filename in sorted(names):
        if not filename.endswith('.mp3'):
            continue
        result = load_mp3(total, dirname, filename)
        if result is None:
            # Nothing was reported for this file. Unpacking None would raise
            # TypeError and abort the whole walk, so skip the file instead.
            continue
        total, chapters = result
        for name, seconds in chapters:
            # A chapter spanning several files repeats its name; only the
            # first (earliest) occurrence is kept.
            all_chapters.setdefault(name, seconds)

    if all_chapters:
        with open('overdrive_chapters.txt', 'w') as out:
            for name, length in all_chapters.items():
                chapstr = u'{0} {1}'.format(timestr(length), name)
                print(chapstr)
                out.write(chapstr + '\n')



if __name__ == '__main__':
    # Scan the directory given as the first argument, or the current one.
    if len(sys.argv) > 1:
        path = os.path.abspath(sys.argv[1])
    else:
        path = os.path.abspath('.')
    print(path)

    # os.walk is available on both Python 2 and Python 3, whereas
    # os.path.walk was removed in Python 3. Only the file names are handed to
    # visit(), so a subdirectory whose name ends in ".mp3" is not mistaken for
    # an audio file. The top path is absolute, so the chdir() done by visit()
    # does not disturb the walk.
    for dirname, dirs, files in os.walk(path):
        visit(None, dirname, files)

Chris Petersen · Answer 1 · Wed Oct 05 2022 14:26:05 GMT+0800 (China Standard Time)

It's apparently been a long time since I've used this script. Anyway, here's an updated version for python3:

#!/usr/bin/env python3
#
# Recursively scans current or specified directory for all subdirectories
# containing mp3 files. If these mp3 files contain overdrive chapter markers
# (id3 tag), writes overdrive_chapters.txt to the same directory.
#
# Usage:
#
# extract_overdrive_chapters.py [optional directory path]
#
# Use with build_m4b from https://github.com/ex-nerd/audiotools
#
# Note: Due to overdrive low quality, there is no point in encoding aac files
# with better than: 64kbps stereo, HE, optimize for voice
#

import os, sys, re
import mutagen.id3 as id3
from mutagen.mp3 import MP3
from mutagen import File

from collections import OrderedDict


def timestr(secs):
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    The value is converted to a rounded integer millisecond count and split
    with divmod. Slicing the decimal digits out of ``str(secs)`` instead is
    fragile: 1.5 came out as ``.005``, a fraction such as ``.9996`` rounded
    up into a four-digit ``.1000`` field, and any value whose ``str()`` has
    no ``.`` (an int, or a float printed in exponent notation) raised
    ValueError.

    Args:
        secs: Non-negative duration in seconds (int or float).

    Returns:
        Zero-padded timestamp string, e.g. ``"01:01:01.250"``.
    """
    total_ms = round(float(secs) * 1000)
    whole_secs, ms = divmod(total_ms, 1000)
    hours, remainder = divmod(whole_secs, 3600)
    mins, whole_secs = divmod(remainder, 60)
    return f"{hours:02}:{mins:02}:{whole_secs:02}.{ms:03}"


def load_mp3(total, dir, file):
    """Read the OverDrive chapter markers stored in one mp3 file.

    Args:
        total: Running offset in seconds (the summed length of the files
            processed so far); every marker time is shifted by it.
        dir: Directory containing the file.
        file: Name of the mp3 file inside ``dir``.

    Returns:
        ``(new_total, chapters)``. ``new_total`` is ``total`` plus this
        file's length. ``chapters`` is a list of ``[name, seconds]`` pairs
        and is empty when the file has no ``TXXX:OverDrive MediaMarkers`` tag.
    """
    path = os.path.join(dir, file)
    audio = MP3(path)
    # Computed up front so every exit path reports the same advanced offset.
    new_total = total + audio.info.length
    m = id3.ID3(path)

    data = m.get("TXXX:OverDrive MediaMarkers")
    if not data:
        print("Can't find TXXX data point for {0}".format(file))
        print(m.keys())
        # The result is documented as a (total, chapters) pair. A bare return
        # (None) breaks tuple unpacking in the caller and loses this file's
        # duration, which would shift every later chapter timestamp.
        return (new_total, [])

    info = data.text[0]
    file_chapters = re.findall(
        r"<Name>\s*([^>]+?)\s*</Name><Time>\s*([\d:.]+)\s*</Time>", info, re.MULTILINE
    )

    chapters = []
    for name, length in file_chapters:
        name = re.sub(r'^"(.+)"$', r"\1", name)  # strip surrounding double quotes
        name = re.sub(r"^\*(.+)\*$", r"\1", name)  # strip surrounding asterisks
        # ignore any sub-chapter markers from Overdrive
        name = re.sub(r"\s*\([^)]*\)$", "", name)
        name = re.sub(r"\s+\(?continued\)?$", "", name)
        name = re.sub(r"\s+-\s*$", "", name)
        # ignore any disk markers from Overdrive
        name = re.sub(r"^Dis[kc]\s+\d+\W*$", "", name)
        name = name.strip()

        # The time is written as [[H:]M:]S with an optional fraction on the
        # seconds; reverse the fields so index 0 is always the seconds.
        t_parts = list(length.split(":"))
        t_parts.reverse()
        seconds = total + float(t_parts[0])
        if len(t_parts) > 1:
            seconds += int(t_parts[1]) * 60
        if len(t_parts) > 2:
            seconds += int(t_parts[2]) * 60 * 60
        chapters.append([name, seconds])
    return (new_total, chapters)


def visit(dirname, filenames):
    """Collect the chapter markers of one directory and write its chapter list.

    Every ``*.mp3`` entry in ``filenames`` is read in sorted order; when at
    least one chapter is found, ``overdrive_chapters.txt`` is written into
    the directory with one ``<timestamp> <name>`` line per chapter.

    Args:
        dirname: Directory being visited. The process working directory is
            changed to it.
        filenames: Names of the files inside ``dirname``.
    """
    print(dirname)
    # The output file below is opened relative to the working directory.
    os.chdir(dirname)

    total = 0
    all_chapters = OrderedDict()
    for filename in sorted(filenames):
        if not filename.endswith(".mp3"):
            continue
        result = load_mp3(total, dirname, filename)
        if result is None:
            # Nothing was reported for this file. Unpacking None would raise
            # TypeError and abort the whole walk, so skip the file instead.
            continue
        total, chapters = result
        for name, seconds in chapters:
            # A chapter spanning several files repeats its name; only the
            # first (earliest) occurrence is kept.
            all_chapters.setdefault(name, seconds)

    if all_chapters:
        with open("overdrive_chapters.txt", "w") as out:
            for name, length in all_chapters.items():
                chapstr = f"{timestr(length)} {name}"
                print(chapstr)
                out.write(chapstr + "\n")


if __name__ == "__main__":
    # Scan the directory given as the first argument, or the current one.
    target = sys.argv[1] if len(sys.argv) > 1 else "."
    path = os.path.abspath(target)

    skipped = {".git", ".direnv"}
    for dirname, dirs, files in os.walk(path, topdown=True):
        # Prune in place: with topdown=True, os.walk only descends into the
        # directories still listed in ``dirs``.
        dirs[:] = [entry for entry in dirs if entry not in skipped]
        visit(dirname, files)

I guess I should update my audiotools scripts for python3, too.

choc96208 · Answer 2 · Mon Oct 10 2022 09:25:24 GMT+0800 (China Standard Time)

Hi @ex-nerd, I added this script to audiobook_chapters. Hope you don't mind. I credited you in the references. I then use another script to create a FFMETADATAFILE.

Chris Petersen · Answer 3 · Mon Oct 10 2022 13:50:33 GMT+0800 (China Standard Time)

@choc96208 Sure thing. It's not like me to leave a license off of my code. Consider it MIT (I'll update the comments and https://github.com/ex-nerd/audiotools accordingly)

Christopher Brown · Answer 4 · Mon Dec 12 2022 10:43:55 GMT+0800 (China Standard Time)

Cool, but out of scope, sorry.

bender · Answer 5 · Thu Jan 26 2023 04:59:18 GMT+0800 (China Standard Time)

@ex-nerd slightly dumb question here but I'm still fairly new to all this -- how would I specify a particular directory to run this in?

Chris Petersen · Answer 6 · Thu Jan 26 2023 05:02:59 GMT+0800 (China Standard Time)

@LeLawnGames Just pass it as the first parameter, e.g. `./extract_overdrive_chapters.py /path/to/directory`. Or leave off the path and it will run in the current directory by default.

bender · Answer 7 · Thu Jan 26 2023 05:15:11 GMT+0800 (China Standard Time)

oh gotcha -- thank you!