Using word-level timing information when editing subtitles or captions in Emacs

I like to split captions at logical points, such as at the end of a phrase or sentence. At first, I used subed.el to play the video for the caption, pausing it at the appropriate point and then calling subed-split-subtitle to split at the playback position. Then I modified subed-split-subtitle to split at the video position that’s proportional to the text position, so that it’s roughly in the right spot even if I’m not currently listening. That got me most of the way to being able to quickly edit subtitles.

It turns out that word-level timing is actually available from YouTube if I download the autogenerated SRV2 file using youtube-dl, which I can do with the following function:

(defun my/caption-download-srv2 (id)
  (interactive "MID: ")
  (when (string-match "v=\\([^&]+\\)" id) (setq id (match-string 1 id)))
  (call-process "youtube-dl" nil nil nil "--write-auto-sub" "--sub-lang" "en" "--skip-download" "--sub-format" "srv2"
                (concat "" id))
  (my/caption-load-word-data (my/latest-file "." "\\.srv2\\'")))

I started parsing JSON files, but SRV2 seemed to be more reliably avaliable, so here are the parsing functions for both. I also change common recognition errors along the way, using the my/subed-common-edits variable defined in my config for subtitles. To change those ones in the VTT file I’m editing, I use my/subed-fix-common-errors, also defined elsewhere.

(defvar-local my/caption-cache nil "Word-level timing in the form ((start . ms) (end . ms) (text . ms))")
(defun my/caption-json-time-to-ms (json)
  (+ (* 1000 (string-to-number (alist-get 'seconds json)))
     (/ (alist-get 'nanos json) 1000000)))

(defun my/caption-extract-words-from-json3 ()
  (let* ((data (progn (goto-char (point-min)) (json-read)))
         (json3-p (alist-get 'events data))
         (reversed (reverse
                    (or (alist-get 'events data)
                        (cl-loop for seg in (car (alist-get 'results data))
                                 nconc (alist-get 'words (car (alist-get 'alternatives seg)))))))
         (last-event (seq-first reversed))
         (last-ms (if json3-p
                      (+ (alist-get 'tStartMs last-event)
                         (alist-get 'dDurationMs last-event)))))
     (cl-loop for e across reversed append
              (if json3-p
                   (lambda (seg)
                     (let ((rec
                            `((start ,(+ (alist-get 'tStartMs e)
                                         (or (alist-get 'tOffsetMs seg) 0)))
                              (end ,(min last-ms
                                         (+ (alist-get 'tStartMs e)
                                            (or (alist-get 'dDurationMs e) 0))))
                              (text ,(alist-get 'utf8 seg)))))
                       (setq last-ms (alist-get 'start rec))
                   (reverse (alist-get 'segs e)))
                `((start ,(my/caption-json-time-to-ms (alist-get 'startTime seg)))
                  (end ,(my/caption-json-time-to-ms (alist-get 'endTime seg)))
                  (text ,(alist-get 'word seg))))))))

(defun my/caption-extract-words-from-srv2 ()
  (let* ((data (xml-parse-region))
         (text-elements (reverse (dom-by-tag data 'text)))
         (last-start (+ (string-to-number
                         (alist-get 't (xml-node-attributes (car text-elements))))
                        (string-to-number (alist-get 'd (xml-node-attributes (car text-elements)))))))
     (mapcar #'(lambda (element)
                 (let ((rec (list (cons 'start (string-to-number (alist-get 't (xml-node-attributes element))))
                                  (cons 'end last-start)
                                  (cons 'text (car (xml-node-children element))))))
                   (setq last-start (alist-get 'start rec))

(defun my/caption-fix-common-errors (data)
  (mapc (lambda (o)
          (mapc (lambda (e)
                  (when (string-match (concat "\\<" (car e) "\\>") (alist-get 'text o))
                    (map-put! o 'text (replace-match (cadr e) t t (alist-get 'text o)))))

(defun my/caption-load-word-data (file)
  "Load word-level timing from FILE."
  (interactive "fFile: ")
  (let (data)
    (with-current-buffer (find-file-noselect file)
       ((string-match "\\.json" file)
        (setq data (my/caption-extract-words-from-json3)))
       ((string-match "\\.srv2\\'" file)
        (setq data (my/caption-extract-words-from-srv2)))
       (t (error "Unknown format."))))
    (setq-local my/caption-cache
                (mapcar (lambda (entry)
                          (setf (alist-get 'text entry)
                                (replace-regexp-in-string "&#39;" "'" (alist-get 'text entry)))
                        (my/caption-fix-common-errors data)))))

Assuming I start editing from the beginning of the file, then the part of the captions file after point is mostly unedited. That means I can match the remainder of the current caption with the word-level timing to try to figure out the time to use when splitting the subtitle, falling back to the proportional method if the data is not available.

(defun my/caption-look-up-word ()
    (let* ((end (subed-subtitle-msecs-stop))
           (start (subed-subtitle-msecs-start))
           (remaining-words (split-string (buffer-substring (point) (subed-jump-to-subtitle-end))))
           (words (reverse (seq-filter (lambda (o)
                                         (and (<= (alist-get 'end o) end)
                                              (>= (alist-get 'start o) start)
                                              (not (string-match "^\n*$" (alist-get 'text o)))))
           (offset 0)
           candidate done)
      (while (not done)
        (setq candidate (elt words (+ (1- (length remaining-words)) offset)))
         ((and candidate (string-match (concat "\\<" (car remaining-words) "\\>") (alist-get 'text candidate)))
          (setq done t))
         ((> offset (length words)) (setq done t))
         ((> offset 0) (setq offset (- offset)))
         (t (setq offset (1+ (- offset))))))

(defun my/caption-unwrap ()
  (let ((limit (save-excursion (or (subed-jump-to-subtitle-end) (point)))))
         (while (re-search-forward "\n" limit t)
           (replace-match " "))))
(defun my/caption-split ()
  "Split the current subtitle based on word-level timing if available."
    (let ((data (my/caption-look-up-word)))
      (prin1 data)
      (subed-split-subtitle (and data (- (alist-get 'start data) (subed-subtitle-msecs-start)))))))
(defun my/caption-split-and-merge-with-next ()
(defun my/caption-split-and-merge-with-previous ()
(use-package subed
  :if my/laptop-p
  :load-path "~/vendor/subed/subed"
  (:map subed-mode-map
        ("M-'" . my/caption-split)
        ("M-," . my/caption-split-and-merge-with-previous)
        ("M-q" . my/caption-unwrap)
        ("M-." . my/caption-split-and-merge-with-next)))

That way, I can use the word-level timing information for most of the reformatting, but I can easily replay segments of the video if I’m unsure about a word that needs to be changed.

If I want to generate a VTT based on the caption data, breaking it at certain words, these functions help:

(defvar my/caption-breaks
  '("the" "this" "we" "we're" "I" "finally" "but" "and" "when")
  "List of words to try to break at.")
(defun my/caption-make-groups (list)
  (let (result
        (current-length 0)
        (limit 70)
        (lower-limit 30)
        (break-regexp (concat "\\<" (regexp-opt my/caption-breaks) "\\>")))
    (while list
       ((null (car list)))
       ((string-match "^\n*$" (alist-get 'text (car list)))
        (push (cons '(text . " ") (car list)) current-item)
        (setq current-length (1+ current-length)))
       ((< (+ current-length (length (alist-get 'text (car list)))) limit)
        (setq current-item (cons (car list) current-item)
              current-length (+ current-length (length (alist-get 'text (car list))) 1)))
       (t (setq done nil)
          (while (not done)
           ((< current-length lower-limit)
            (setq done t))
           ((and (string-match break-regexp (alist-get 'text (car current-item)))
                 (not (string-match break-regexp (alist-get 'text (cadr current-item)))))
            (setq current-length (- current-length (length (alist-get 'text (car current-item)))))
            (push (pop current-item) list)
            (setq done t))
            (setq current-length (- current-length (length (alist-get 'text (car current-item)))))
            (push (pop current-item) list))))
          (push nil list)
          (setq result (cons (reverse current-item) result) current-item nil current-length 0)))
      (setq list (cdr list)))
    (reverse result)))

(defun my/caption-format-as-subtitle (list &optional word-timing)
  "Turn a LIST of the form (((start . ms) (end . ms) (text . s)) ...) into VTT.
If WORD-TIMING is non-nil, include word-level timestamps."
  (format "%s --> %s\n%s\n\n"
          (subed-vtt--msecs-to-timestamp (alist-get 'start (car list)))
          (subed-vtt--msecs-to-timestamp (alist-get 'end (car (last list))))
          (s-trim (mapconcat (lambda (entry)
                               (if word-timing
                                   (format " <%s>%s"
                                           (subed-vtt--msecs-to-timestamp (alist-get 'start entry))
                                           (string-trim (alist-get 'text entry)))
                                 (alist-get 'text entry)))
                             list ""))))

(defun my/caption-to-vtt (&optional data)
  (with-temp-file "captions.vtt"
    (insert "WEBVTT\n\n"
             (lambda (entry) (my/caption-format-as-subtitle entry))
              (or data (my/caption-fix-common-errors my/caption-cache)))