Emacs Lisp and NodeJS: Getting the bolded words from a section of a Google Document
| french, js, emacs
During the sessions with my French tutor, I share a Google document so that we can mark the words where I need to practice my pronunciation some more or tweak the wording. Using Ctrl+B to make a word bold is an easy way to make it jump out.
I used to copy these changes into my Org Mode notes manually, but today I thought I'd try automating some of it.
First, I need a script to download the HTML for a specified Google document. This is probably easier to do with the NodeJS library rather than with oauth2.el and url-retrieve-synchronously because of various authentication things.
require('dotenv').config();
const { google } = require('googleapis');
/**
 * Export a Google document as HTML through the Drive v3 API.
 *
 * Authenticates with a GoogleAuth client limited to the read-only Drive
 * scope, then asks Drive to export the file with the text/html MIME type.
 *
 * @param {string} fileId - ID of the document to export.
 * @returns {Promise<*>} The `data` field of the export response
 *   (presumably the HTML text).
 */
async function download(fileId) {
  const scopes = ['https://www.googleapis.com/auth/drive.readonly'];
  const auth = new google.auth.GoogleAuth({ scopes });
  const drive = google.drive({ version: 'v3', auth });
  const exportRequest = { fileId, mimeType: 'text/html' };
  const { data } = await drive.files.export(exportRequest);
  return data;
}
/**
 * Command-line entry point: print the exported HTML of a Google document
 * to stdout.
 *
 * The document ID is the first command-line argument when one is given,
 * otherwise the DOC_ID environment variable.
 *
 * @returns {Promise<void>}
 * @throws {Error} If neither source provides a document ID.
 */
async function main() {
  const fileId = process.argv.length > 2 ? process.argv[2] : process.env['DOC_ID'];
  if (!fileId) {
    throw new Error('No document ID: pass one as the first argument or set DOC_ID');
  }
  console.log(await download(fileId));
}
// Report failures on stderr and exit non-zero instead of leaving the
// rejection unhandled, so whoever runs the script can tell the export failed.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
Then I can wrap a little bit of Emacs Lisp around it.
(defvar my-google-doc-download-command
  `("nodejs" ,(expand-file-name "~/bin/download-google-doc-html.cjs"))
  "Command used to download a Google document as HTML.
This is a list whose first element is the program to run and whose
remaining elements are its leading arguments.  `my-google-doc-html'
appends the document ID as the final argument.")
(defun my-google-doc-html (doc-id)
  "Return the HTML export of the Google document DOC-ID as a string.
DOC-ID is either a bare document ID or a docs.google.com document URL,
from which the ID is extracted.  The ID is passed as the last argument
to `my-google-doc-download-command' and the output of that command is
returned.  Signal an error if the command does not exit with status 0."
  ;; Accept URLs with or without anything after the ID (/edit, a query
  ;; string, a fragment) as well as the multi-account /u/N/ form.
  (when (string-match
         "https://docs\\.google\\.com/document/\\(?:u/[0-9]+/\\)?d/\\([^/?#]+\\)"
         doc-id)
    (setq doc-id (match-string 1 doc-id)))
  (with-temp-buffer
    (let ((status (apply #'call-process (car my-google-doc-download-command)
                         nil t nil
                         (append (cdr my-google-doc-download-command)
                                 (list doc-id)))))
      ;; STATUS is a numeric exit code, or a string describing a signal.
      ;; Without this check, the command's error output would be handed
      ;; back to the caller as if it were the document's HTML.
      (unless (eq status 0)
        (error "Downloading Google document %s failed (%s): %s"
               doc-id status (buffer-string)))
      (buffer-string))))
I have lots of sections in that document, including past journal entries, so I want to get a specific section by name.
(defun my-html--heading-tag (node)
  "Return NODE's tag name as a string if NODE is an h1-h6 element, else nil."
  (when (consp node)
    (let ((tag (dom-tag node)))
      (and (symbolp tag)
           (string-match-p "\\`h[1-6]\\'" (symbol-name tag))
           (symbol-name tag)))))

(defun my-html-get-section (dom section-name)
  "Return the DOM nodes in the section of DOM titled SECTION-NAME.
The section is located through a heading element (h1 to h6) whose
trimmed text equals SECTION-NAME.  It consists of the siblings that
follow that heading, up to but not including the next heading of the
same or a higher level.  The heading itself is not included.  Nodes
are returned in document order.  Return nil if no heading matches."
  (let* ((heading (car (dom-search
                        dom
                        (lambda (o)
                          (and (my-html--heading-tag o)
                               (string= (string-trim (dom-texts o " "))
                                        section-name))))))
         (parent (and heading (dom-parent dom heading)))
         level
         results)
    (catch 'done
      (dolist (o (dom-children parent))
        ;; TAG is nil for non-heading elements and for bare text children,
        ;; which must not be passed to `dom-tag'.
        (let ((tag (my-html--heading-tag o)))
          (cond
           ;; The section heading: remember its level and start collecting.
           ((and tag (string= (string-trim (dom-texts o)) section-name))
            (setq level tag))
           ;; Ignore everything before the matching heading.
           ((null level))
           ;; Heading tags compare as strings (h1 < h2 < ...), so a tag
           ;; that is not greater than LEVEL ends the section.
           ((and tag (not (string< level tag)))
            (throw 'done nil))
           (t (push o results))))))
    ;; Reverse on every exit path, including when the section runs to the
    ;; end of its parent without meeting another heading.
    (nreverse results)))
Now I can get the bolded words from a section of my notes, with just a sentence for context. I use pandoc to convert it to Org Mode syntax.
(defvar my-lang-words-for-review-context-function #'sentence-at-point
  "Function that returns the context string for a bolded word.
`my-lang-words-for-review' calls it with no arguments, with point just
after each bolded match, and records the string it returns.")
(defun my-lang-tutor-notes (section-name)
  "Return a div DOM node holding the SECTION-NAME section of the tutor notes.
The notes are fetched as HTML from `my-lang-tutor-notes-url' with
`my-google-doc-html', parsed with libxml, and reduced to the requested
section with `my-html-get-section'."
  (let* ((html (my-google-doc-html my-lang-tutor-notes-url))
         (parsed (with-temp-buffer
                   (insert html)
                   (libxml-parse-html-region)))
         (section-nodes (my-html-get-section parsed section-name)))
    ;; Wrap the section's nodes in a single root element.
    `(div nil ,@section-nodes)))
(defun my-lang-words-for-review (section)
  "List the bolded words for review in SECTION.
SECTION is the name of a section in the tutor notes.  Return a list of
distinct context strings, in the order they are found, one for each
bolded match.  Each string is produced by calling
`my-lang-words-for-review-context-function' after the match, with
newlines replaced by spaces."
  (let* ((notes (my-lang-tutor-notes section))
         ;; Nodes whose style has font-weight:700 and no font-style:normal
         ;; are treated as bold.  They are retagged as `strong' in place so
         ;; the HTML to Org conversion below marks them with asterisks,
         ;; which is what the search loop looks for.
         (bold-nodes
          (dom-search
           notes
           (lambda (o)
             (let ((style (or (dom-attr o 'style) "")))
               (when (and (string-match "font-weight:700" style)
                          (not (string-match "font-style:normal" style)))
                 (setf (car o) 'strong)
                 t)))))
         parents
         results)
    ;; Several bolded words often share a parent.  Convert each distinct
    ;; parent once instead of running pandoc again for every bolded child;
    ;; the repeated conversions could only yield strings already collected.
    (dolist (o bold-nodes)
      (cl-pushnew (dom-parent notes o) parents :test #'eq))
    (dolist (parent (nreverse parents))
      (with-temp-buffer
        (insert
         (pandoc-convert-stdio
          (with-temp-buffer
            (svg-print parent)
            (buffer-string))
          "html"
          "org"))
        (org-mode)
        (goto-char (point-min))
        (while (re-search-forward "\\*.+?\\*" nil t)
          (cl-pushnew
           (replace-regexp-in-string
            "\n" " "
            (funcall my-lang-words-for-review-context-function))
           results
           :test 'string=))))
    (nreverse results)))
For example, when I run it on my notes on artificial intelligence, this is the list of bolded words and the sentences that contain them.
;; Example: list the bolded words, with their context, from one section.
(my-lang-words-for-review "Sur l'intelligence artificielle")
I can then go into the WhisperX transcription JSON file and replay those parts for closer review.
I can also tweak the context function to give me less context. For example, to limit it to the containing phrase, I can do this:
(defun my-split-string-keep-delimiters (string delimiter)
  "Split STRING before each match of the regexp DELIMITER.
Unlike `split-string', the delimiter text is kept: each match stays at
the start of the piece that follows it.  Return the list of pieces in
order, or nil when STRING is nil."
  (if (null string)
      nil
    (with-temp-buffer
      (insert string)
      (goto-char (point-min))
      (let ((start (point-min))
            (pieces nil))
        (while (re-search-forward delimiter nil t)
          ;; Cut just before the delimiter so it begins the next piece.
          (let ((cut (match-beginning 0)))
            (setq pieces (cons (buffer-substring start cut) pieces)
                  start cut)))
        ;; Whatever follows the last cut is the final piece.
        (reverse (cons (buffer-substring start (point-max)) pieces))))))
(ert-deftest my-split-string-keep-delimiters ()
  "Each delimiter match starts a new piece and stays attached to it."
  (should
   (equal (my-split-string-keep-delimiters
           "Beaucoup de gens ont une réaction forte contre l'IA pour plusieurs raisons qui *incluent* le battage médiatique excessif dont elle fait l'objet, son utilisation à mauvais escient, et *l'inondation de banalité* qu'elle produit."
           ", \\| que \\| qui \\| qu'ils? \\| qu'elles? \\| qu'on "
           )
          ;; `equal' needs the expected value as its second argument.
          '("Beaucoup de gens ont une réaction forte contre l'IA pour plusieurs raisons"
            " qui *incluent* le battage médiatique excessif dont elle fait l'objet"
            ", son utilisation à mauvais escient"
            ", et *l'inondation de banalité*"
            " qu'elle produit."))))
(defun my-lang-words-for-review-phrase-context (&optional s)
  "Return the phrases of sentence S that contain bolded (*...*) words.
S defaults to the sentence at point.  No-break spaces in S are replaced
with regular spaces, S is split at commas and common French conjunctions
with `my-split-string-keep-delimiters', and the pieces that contain an
asterisk are joined with an ellipsis between them."
  (let* ((sentence (or s (sentence-at-point) ""))
         ;; Written with escapes so the no-break characters are visible in
         ;; the source: U+00A0 (no-break space) and U+202F (narrow no-break
         ;; space).  The delimiters below are written with plain spaces.
         (normalized (replace-regexp-in-string "[\u00a0\u202f]" " " sentence))
         (phrases (my-split-string-keep-delimiters
                   normalized
                   ", \\| parce que \\| que \\| qui \\| qu'ils? \\| qu'elles? \\| qu'on \\| pour ")))
    (string-join
     (seq-filter (lambda (phrase) (string-match "\\*" phrase))
                 phrases)
     " ... ")))
(ert-deftest my-lang-words-for-review-phrase-context ()
  "Only the phrase that contains the bolded word is kept."
  (let ((sentence "Je peux consacrer une petite partie de mon *budget* à des essais, mais je ne veux pas travailler davantage pour rentabiliser une dépense plus importante.")
        (expected "Je peux consacrer une petite partie de mon *budget* à des essais"))
    (should (equal (my-lang-words-for-review-phrase-context sentence)
                   expected))))
;; Example: same listing, but with the context function bound to the
;; phrase-level version for the duration of the call.
(let ((my-lang-words-for-review-context-function 'my-lang-words-for-review-phrase-context))
(my-lang-words-for-review "Sur l'intelligence artificielle"))
Now that I have a function for retrieving the HTML or Org Mode for a section, I can use that to wdiff against my current text to more easily spot wording changes.
(defun my-lang-tutor-notes-wdiff-org ()
  "Word-diff the Org entry at point against the matching tutor-notes section.
The heading of the entry at point is used as the section name.  The
entry's text, from `my-org-subtree-text-without-blocks', is compared
using `my-wdiff-strings' with the tutor-notes section converted from
HTML to Org.  No-break spaces are replaced with regular spaces on both
sides first."
  (interactive)
  (let* ((section (org-entry-get (point) "ITEM"))
         ;; Written with escapes so the no-break characters are visible in
         ;; the source: U+00A0 (no-break space) and U+202F (narrow no-break
         ;; space).
         (nbsp-re "[\u00a0\u202f]")
         (local-text (my-org-subtree-text-without-blocks))
         (notes-org (pandoc-convert-stdio
                     (with-temp-buffer
                       (svg-print (my-lang-tutor-notes section))
                       (buffer-string))
                     "html"
                     "org")))
    (my-wdiff-strings
     (replace-regexp-in-string nbsp-re " " local-text)
     (replace-regexp-in-string nbsp-re " " notes-org))))
Related:
- my-wdiff-strings is in Wdiff
- my-org-subtree-text-without-blocks is in Counting words without blocks
Screenshot: