bash -lc cat > /mnt/data/gist-comment-parser/src/parse-gist-comments.ts <<'TS'
import { createHash } from "node:crypto";
import { readFile, writeFile } from "node:fs/promises";
import { basename } from "node:path";
import process from "node:process";
import * as cheerio from "cheerio";
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";
import type { Cheerio, CheerioAPI } from "cheerio";
import type { Element } from "domhandler";
/** A Cheerio selection whose members are HTML elements (`domhandler` `Element`) of the parsed document. */
type HtmlSelection = Cheerio<Element>;
/** A string, or null when the page does not contain the requested value. */
type MaybeString = string | null;
/** A number, or null when the page does not contain the requested value. */
type MaybeNumber = number | null;
/**
 * Runtime assertion helpers; the real project may replace these with richer errors.
 *
 * The object is callable (`A(value, message)`) and also carries comparison variants as properties.
 * Every variant throws a plain `Error` carrying `message` when its check fails.
 */
interface AssertFunction {
  /** Throws when `value` is falsy; narrows `value` to truthy for the type checker. */
  (value: unknown, message: string): asserts value;
  /** Throws unless `Object.is(a, b)`. */
  is: (a: unknown, b: unknown, message: string) => void;
  /** Throws when `Object.is(a, b)`. */
  nis: (a: unknown, b: unknown, message: string) => void;
  /** Throws unless `a === b`. */
  eq: (a: unknown, b: unknown, message: string) => void;
  /** Throws when `a === b`. */
  neq: (a: unknown, b: unknown, message: string) => void;
  /** Throws unless `a < b`. */
  lt: (a: number, b: number, message: string) => void;
  /** Throws unless `a <= b`. */
  lte: (a: number, b: number, message: string) => void;
  /** Throws unless `a > b`. */
  gt: (a: number, b: number, message: string) => void;
  /** Throws unless `a >= b`. */
  gte: (a: number, b: number, message: string) => void;
}
const A: AssertFunction = Object.assign(
  (value: unknown, message: string): asserts value => {
    if (value) {
      return;
    }
    throw new Error(message);
  },
  {
    is: (a: unknown, b: unknown, message: string): void => {
      if (Object.is(a, b)) {
        return;
      }
      throw new Error(message);
    },
    nis: (a: unknown, b: unknown, message: string): void => {
      if (!Object.is(a, b)) {
        return;
      }
      throw new Error(message);
    },
    eq: (a: unknown, b: unknown, message: string): void => {
      if (a === b) {
        return;
      }
      throw new Error(message);
    },
    neq: (a: unknown, b: unknown, message: string): void => {
      if (a !== b) {
        return;
      }
      throw new Error(message);
    },
    // The ordering checks test the positive comparison so that NaN operands always fail.
    lt: (a: number, b: number, message: string): void => {
      if (a < b) {
        return;
      }
      throw new Error(message);
    },
    lte: (a: number, b: number, message: string): void => {
      if (a <= b) {
        return;
      }
      throw new Error(message);
    },
    gt: (a: number, b: number, message: string): void => {
      if (a > b) {
        return;
      }
      throw new Error(message);
    },
    gte: (a: number, b: number, message: string): void => {
      if (a >= b) {
        return;
      }
      throw new Error(message);
    },
  },
);
/** Prefix of the short placeholder URLs that `DataUriStore.store` builds; a per-payload token is appended to it. */
const DATA_URI_PLACEHOLDER_PREFIX = "data:singlefile-omitted,";
/** Origin used when rebuilding the canonical gist URL in `parse_gist_url_parts`. */
const GITHUB_GIST_ORIGIN = "https://gist.github.com";
/** Options chosen at the command line and applied to every parsed comment. */
interface ParserOptions {
// When true, `normalize_comment` copies the rendered body HTML into the record (CLI default: true).
include_body_html: boolean;
// Per the CLI usage text: keep SingleFile `data:` payloads in emitted HTML/Markdown/URLs (CLI default: false).
include_data_uris: boolean;
// When true, `parse_raw_comment` also serializes the whole comment container (CLI default: false).
include_outer_html: boolean;
}
/** Parsed command-line arguments after flags have been separated from positional paths. */
interface CliArgs extends ParserOptions {
// First positional argument: the SingleFile HTML export to read.
input_path: string;
// Second positional argument, or null when omitted (`write_jsonl_file` then writes to stdout).
output_path: MaybeString;
}
/**
 * Counts produced while shrinking SingleFile HTML into content-bearing HTML.
 * All values are filled in by `preprocess_singlefile_html`.
 */
interface HtmlPreprocessStats {
// Length of the input HTML string before any reduction.
original_chars: number;
// Length of the HTML string after every reduction step has run.
preprocessed_chars: number;
// Number of `<style>…</style>` spans removed.
removed_style_blocks: number;
// Number of `<script>…</script>` spans removed.
removed_script_blocks: number;
// Number of `<template>…</template>` spans removed.
removed_template_blocks: number;
// Number of `<svg>…</svg>` spans removed.
removed_svg_blocks: number;
// Number of `<details-menu>` elements whose contents were emptied (attributes kept).
compacted_details_menu_blocks: number;
// Number of `<include-fragment>` elements whose contents were emptied (attributes kept).
compacted_include_fragments: number;
// Number of attribute occurrences whose `data:` value was swapped for a placeholder.
replaced_data_uri_attributes: number;
// Number of distinct data URIs remembered by the store (repeats count once).
unique_data_uris: number;
}
/** A reduced HTML document plus the data URI lookup table needed to restore or summarize embedded data. */
interface PreprocessedHtml {
// The reduced HTML text that is handed to Cheerio.
html: string;
// Store mapping each placeholder URL in `html` back to its original data URI.
data_uris: DataUriStore;
// Bookkeeping about what preprocessing removed or replaced.
stats: HtmlPreprocessStats;
}
/** Metadata inferred once from the SingleFile page and repeated in every JSONL record. */
interface PageMetadata {
// Text following `url:` in the SingleFile header, or null when absent.
source_url: MaybeString;
// Text following `saved date:` in the SingleFile header, kept as written (not parsed into a date).
saved_at: MaybeString;
// Owner path segment of `source_url` when it is a gist.github.com URL, else null.
gist_owner: MaybeString;
// Hexadecimal gist id path segment of `source_url`, else null.
gist_id: MaybeString;
// Canonical `https://gist.github.com/<owner>/<id>` URL rebuilt from the two parts, else null.
gist_url: MaybeString;
// Cleaned text of the first `<title>` element, or null when empty.
title: MaybeString;
// Filename supplied by the caller for provenance; not derived from the HTML.
input_filename: MaybeString;
}
/**
 * Compact metadata about a SingleFile-embedded data URI.
 * NOTE(review): the values are produced by `summarize_data_uri`, which is not visible in this part of
 * the file; the per-field notes below follow the field names and should be confirmed against it.
 */
interface EmbeddedDataUriSummary {
// Token assigned by `DataUriStore.store` (`sfdata-<sequence>-<hash prefix>`).
token: string;
// Placeholder URL that replaced the data URI in the preprocessed HTML.
placeholder_url: string;
// Presumably the media type declared in the data URI header — confirm.
media_type: string;
// Presumably whether the data URI declares base64 encoding — confirm.
is_base64: boolean;
// Presumably a SHA-256 digest of the payload — confirm what exactly is hashed.
payload_sha256: string;
// Presumably the decoded payload size, null when it could not be determined — confirm.
payload_bytes: MaybeNumber;
// Presumably the character length of the complete data URI — confirm.
data_uri_chars: number;
// Presumably a decode failure message, null on success — confirm.
decode_error: MaybeString;
}
/** A stored data URI and its compact summary, as created by `DataUriStore.store`. */
interface StoredDataUri {
// Unique token of the form `sfdata-<sequence>-<first 24 hex chars of the SHA-256 of the data URI>`.
token: string;
// `DATA_URI_PLACEHOLDER_PREFIX` followed by `token`.
placeholder_url: string;
// The complete original `data:` URI.
data_uri: string;
// Summary built by `summarize_data_uri` for this payload.
summary: EmbeddedDataUriSummary;
}
/** Avatar data visible near a gist comment; built by `parse_avatar` from one avatar `<img>`. */
interface AvatarInfo {
// The image's `alt` attribute, or null.
alt: MaybeString;
// The image's `width` attribute parsed as an integer, or null.
width: MaybeNumber;
// The image's `height` attribute parsed as an integer, or null.
height: MaybeNumber;
// NOTE(review): presumably the resolved `src` for output (placeholder or restored data URI) — confirm in `resolve_url_attr`.
src: MaybeString;
// NOTE(review): presumably the `src` value as found in the preprocessed HTML — confirm.
original_src: MaybeString;
// Summary of the embedded payload when the source was a data URI, else null.
embedded_data_uri: EmbeddedDataUriSummary | null;
}
/**
 * A hyperlink found inside a rendered Markdown comment body.
 * NOTE(review): populated by `collect_links`, which is not visible here; field notes follow the names.
 */
interface LinkInfo {
// Position of the link within the comment body (base not visible here — confirm).
index: number;
// Link text.
text: string;
// Link target for output; see `original_href` for the value found in the HTML.
href: MaybeString;
original_href: MaybeString;
// Remaining fields mirror the anchor attributes of the same names.
title: MaybeString;
rel: MaybeString;
target: MaybeString;
aria_label: MaybeString;
// CSS classes on the anchor.
classes: string[];
// Non-null when the link target was an embedded data URI; counted by `count_embedded_data_uri_summaries`.
embedded_data_uri: EmbeddedDataUriSummary | null;
}
/**
 * An image found inside a rendered Markdown comment body.
 * NOTE(review): populated by `collect_images`, which is not visible here; field notes follow the names.
 */
interface ImageInfo {
// Position of the image within the comment body (base not visible here — confirm).
index: number;
alt: MaybeString;
title: MaybeString;
width: MaybeNumber;
height: MaybeNumber;
// Image source for output; see `original_src` for the value found in the HTML.
src: MaybeString;
original_src: MaybeString;
// CSS classes on the image element.
classes: string[];
// Non-null when the image source was an embedded data URI; counted by `count_embedded_data_uri_summaries`.
embedded_data_uri: EmbeddedDataUriSummary | null;
}
/** A checkbox task item recovered from the rendered Markdown body (see `collect_task_list_items`). */
interface TaskListItemInfo {
// Position of the item within the comment body (base not visible here — confirm).
index: number;
// Whether the checkbox was ticked.
checked: boolean;
// Text of the task item.
text: string;
}
/** A fenced or indented code block recovered from the rendered Markdown body (see `collect_code_blocks`). */
interface CodeBlockInfo {
// Position of the block within the comment body (base not visible here — confirm).
index: number;
// Language of the block when one could be determined, else null.
language: MaybeString;
// The code text.
code: string;
// CSS classes on the code block element.
classes: string[];
}
/** A GitHub reaction summary, if SingleFile captured reaction markup for a comment (see `collect_reactions`). */
interface ReactionInfo {
// Position of the reaction within the comment (base not visible here — confirm).
index: number;
// Accessible label of the reaction, when present.
label: MaybeString;
// Emoji shown for the reaction, when present.
emoji: MaybeString;
// Reaction count, or null when it could not be read.
count: MaybeNumber;
// Whether the saved page marked the reaction as made by the viewing user.
reacted_by_current_user: boolean;
// CSS classes on the reaction element.
classes: string[];
}
/** Edit-related controls and URLs visible in the comment chrome; assembled in `parse_raw_comment`. */
interface CommentEditInfo {
// True when the header contains exactly one `.js-comment-edit-history` element.
is_edited: boolean;
// Normalized `action` of the `form.js-comment-update` child of the comment group.
update_form_action: MaybeString;
// `method` attribute of that same update form.
update_form_method: MaybeString;
// Normalized `data-preview-url` of the `.js-previewable-comment-form` inside the update form.
preview_url: MaybeString;
// Normalized `src` of the header `details-menu` that is not the edit-history menu.
comment_actions_menu_url: MaybeString;
// Normalized `src` of the `details-menu.js-comment-edit-history-menu` inside the edit-history element.
edit_history_url: MaybeString;
}
/**
 * The first pass keeps page-shaped values close to where they appeared in the DOM.
 * Produced by `parse_raw_comment` and consumed by `normalize_comment`.
 */
interface RawGistComment {
// One-based position of the comment in page order.
index: number;
// Classes of the closest `.js-comment-container` ancestor.
container_classes: string[];
// The `id` attribute of the comment group element (`gistcomment-<digits>`).
comment_group_id: string;
// Classes of the comment group element itself.
comment_group_classes: string[];
// Comment id extracted from `comment_group_id`.
comment_id: string;
// Cleaned text of the header's `a.author` link, or null when empty.
author_login: MaybeString;
author_url: MaybeString;
author_link_text: MaybeString;
author_hovercard_url: MaybeString;
// Badge labels collected from the header; `normalize_comment` checks for "Author".
author_badges: string[];
// Avatar details, or null unless exactly one avatar image was found in the container.
avatar: AvatarInfo | null;
// Normalized `href` of the header's `a.js-timestamp` link.
permalink_url: MaybeString;
// `id` attribute of that timestamp link.
permalink_id: MaybeString;
// Fragment extracted from `permalink_url`.
permalink_fragment: MaybeString;
// `datetime` attribute of the `<relative-time>` element.
created_at: MaybeString;
// `title` attribute of the `<relative-time>` element.
created_at_title: MaybeString;
// Visible timestamp text, falling back to the timestamp link's text.
created_at_text: MaybeString;
// `created_at` converted by `date_ms_from_iso`.
created_at_ms: MaybeNumber;
// Text of any `template` descendants of `<relative-time>`.
relative_time_template: MaybeString;
edit: CommentEditInfo;
// Inner HTML of the rendered comment body, prepared for output.
body_html: string;
// Text of the body, excluding `template` descendants.
body_text: string;
links: LinkInfo[];
images: ImageInfo[];
task_list_items: TaskListItemInfo[];
code_blocks: CodeBlockInfo[];
reactions: ReactionInfo[];
// Serialized comment container, or null unless `include_outer_html` was requested.
outer_html: MaybeString;
}
/**
 * The final JSONL schema exposes normalized values and Markdown output.
 * Built by `normalize_comment`; unless noted, each field is copied from the `RawGistComment` field of the same name.
 */
interface GistCommentJson {
// Constant discriminator for this record shape.
record_type: "gist_comment";
// Page-level metadata, repeated on every record.
page: PageMetadata;
index: number;
comment_id: string;
// The raw comment group's element id (`RawGistComment.comment_group_id`).
comment_anchor: string;
permalink_url: MaybeString;
permalink_fragment: MaybeString;
// Author fields regrouped from the flat `author_*` raw fields.
author: {
login: MaybeString;
url: MaybeString;
link_text: MaybeString;
hovercard_url: MaybeString;
badges: string[];
};
avatar: AvatarInfo | null;
created_at: MaybeString;
created_at_title: MaybeString;
created_at_text: MaybeString;
created_at_ms: MaybeNumber;
relative_time_template: MaybeString;
// True when the comment group has the `current-user` class.
is_current_user: boolean;
// True when the author badges include "Author".
is_author: boolean;
// Copy of `edit.is_edited`.
is_edited: boolean;
// True when the comment group has the `minimized-comment` class.
is_minimized: boolean;
container_classes: string[];
comment_group_classes: string[];
edit: CommentEditInfo;
// Convenience copy of `edit.update_form_action`.
edit_form_action: MaybeString;
// Convenience copy of `edit.preview_url`.
preview_url: MaybeString;
// Body HTML converted to GitHub-flavored Markdown by Turndown and then whitespace-normalized.
body_markdown: string;
body_text: string;
// Present only when `include_body_html` is set.
body_html?: string;
links: LinkInfo[];
images: ImageInfo[];
task_list_items: TaskListItemInfo[];
code_blocks: CodeBlockInfo[];
reactions: ReactionInfo[];
// Number of avatar/link/image entries carrying a data URI summary (occurrences, not unique payloads).
embedded_data_uri_count:number;
// Present only when `include_outer_html` is set and the raw comment carried outer HTML.
outer_html?: string;
}
/**
 * A tiny structural type for Turndown nodes whose attributes we inspect.
 * It lists only the members that the custom rules in `make_turndown_service` actually read.
 */
interface TurndownNode {
// Tag name; the rules lower-case it before comparing.
nodeName: string;
// Text content of the node; may be null.
textContent: string | null;
// Attribute lookup returning null when the attribute is absent.
getAttribute(name: string): string | null;
}
/**
 * Registry that keeps unique SingleFile data URIs out of Cheerio's parser while making them
 * restorable later. Each distinct data URI gets one record, reachable both by the original
 * URI and by the short placeholder URL that stands in for it inside the HTML.
 */
class DataUriStore {
  /** Records keyed by the complete original data URI; used to reuse records for repeated payloads. */
  private readonly data_uri_to_record = new Map<string, StoredDataUri>();
  /** The same records keyed by their placeholder URL. */
  private readonly placeholder_to_record = new Map<string, StoredDataUri>();

  /**
   * Remember one data URI and return the record whose placeholder URL should replace it in HTML.
   * Storing the same URI again returns the record created the first time.
   *
   * @param data_uri - Complete `data:` URI found in a URL-bearing HTML attribute.
   * @returns The stored record, including its short placeholder URL and compact summary.
   */
  public store(data_uri: string): StoredDataUri {
    A(data_uri.startsWith("data:"), "DataUriStore.store expects a data: URI.");
    const known_record = this.data_uri_to_record.get(data_uri);
    if (known_record !== undefined) {
      return known_record;
    }
    // Token = running sequence number plus a 24-hex-character prefix of the URI's SHA-256.
    const full_digest = createHash("sha256").update(data_uri).digest("hex");
    const token = ["sfdata", this.data_uri_to_record.size + 1, full_digest.slice(0, 24)].join("-");
    const placeholder_url = DATA_URI_PLACEHOLDER_PREFIX + token;
    const record: StoredDataUri = {
      token,
      placeholder_url,
      data_uri,
      summary: summarize_data_uri(data_uri, token, placeholder_url),
    };
    this.data_uri_to_record.set(data_uri, record);
    this.placeholder_to_record.set(placeholder_url, record);
    return record;
  }

  /**
   * Look up a stored data URI by the placeholder URL currently present in parsed HTML.
   *
   * @param url - Attribute value that may be a `data:singlefile-omitted,...` placeholder.
   * @returns The stored data URI record, or null when the URL is null or not one of our placeholders.
   */
  public lookup_placeholder(url: MaybeString): StoredDataUri | null {
    const found = url === null ? undefined : this.placeholder_to_record.get(url);
    return found ?? null;
  }

  /**
   * Count the unique data URIs remembered by this store.
   *
   * @returns The number of distinct data URIs stored so far.
   */
  public unique_count(): number {
    return this.data_uri_to_record.size;
  }
}
/**
 * Parse CLI arguments.
 *
 * Recognized flags toggle parser options; `--help`/`-h` prints usage and exits with status 0.
 * Any other argument starting with `--` is rejected. Everything else is collected as a positional
 * path: the first is the input file, the optional second is the output file. Zero or more than
 * two positional paths print usage to stderr and exit with status 1.
 *
 * @param argv - Process arguments after the node executable and script path.
 * @returns A validated argument object with input/output paths and parser flags.
 * @throws Error when an argument is an empty string or an unknown `--` option.
 */
function parse_cli_args(argv: readonly string[]): CliArgs {
  const paths: string[] = [];
  const options: ParserOptions = {
    include_body_html: true,
    include_data_uris: false,
    include_outer_html: false,
  };
  for (const arg of argv) {
    A.neq(arg, "", "CLI arguments must not be empty strings.");
    if (arg === "--help" || arg === "-h") {
      print_usage_and_exit(0);
    }
    if (arg === "--include-data-uris") {
      options.include_data_uris = true;
      continue;
    }
    if (arg === "--exclude-body-html") {
      options.include_body_html = false;
      continue;
    }
    if (arg === "--include-outer-html") {
      options.include_outer_html = true;
      continue;
    }
    if (arg.startsWith("--")) {
      throw new Error(`Unknown option: ${arg}`);
    }
    paths.push(arg);
  }
  if (paths.length < 1 || paths.length > 2) {
    print_usage_and_exit(1);
  }
  // Destructure instead of indexing so a missing second path is handled explicitly: under
  // `noUncheckedIndexedAccess` an indexed read is `string | undefined`, which is not a MaybeString.
  const [input_path, optional_output_path] = paths;
  A(input_path, "An input HTML path is required.");
  return {
    input_path,
    output_path: optional_output_path ?? null,
    ...options,
  };
}
/**
 * Print the CLI usage text and terminate the process.
 *
 * The text goes to stdout for a zero exit code (help was requested) and to stderr otherwise.
 *
 * @param exit_code - The numeric process status to use when exiting.
 * @returns This function never returns because it exits the process.
 */
function print_usage_and_exit(exit_code: number): never {
  const usage_lines = [
    "Usage:",
    " npm run parse -- input.html [output.jsonl] [options]",
    "",
    "Options:",
    " --include-data-uris Keep SingleFile data: payloads in emitted HTML/Markdown/URLs.",
    " --exclude-body-html Omit rendered comment HTML and keep Markdown/text/extracted fields.",
    " --include-outer-html Include each preprocessed full comment container HTML. Usually large.",
    " -h, --help Show this help.",
  ];
  const usage_text = usage_lines.join("\n");
  const requested_help = exit_code === 0;
  if (requested_help) {
    console.log(usage_text);
  }
  if (!requested_help) {
    console.error(usage_text);
  }
  process.exit(exit_code);
}
/**
 * Load a file from disk and decode it as UTF-8.
 *
 * @param input_path - Filesystem path to the SingleFile HTML export; must be non-empty.
 * @returns The decoded file contents.
 * @throws Error when the path is empty or the file decodes to an empty string; filesystem errors propagate.
 */
async function read_utf8_file(input_path: string): Promise<string> {
  A(input_path, "input_path must be non-empty.");
  const file_text = await readFile(input_path, { encoding: "utf8" });
  A.gt(file_text.length, 0, `Input file is empty: ${input_path}`);
  return file_text;
}
/**
 * Write JSONL either to a file or stdout.
 *
 * Each record becomes one `JSON.stringify` line terminated by a newline. An empty record list
 * produces empty output rather than a lone blank line, which strict JSONL readers reject.
 *
 * @param output_path - Destination path, or null to write to standard output.
 * @param records - Comment JSON objects to serialize one per line.
 * @returns A promise that resolves after all data has been written (for stdout: after the
 * stream reports the chunk as flushed), and rejects when the write fails.
 */
async function write_jsonl_file(output_path: MaybeString, records: readonly GistCommentJson[]): Promise<void> {
  const jsonl = records.map((record) => `${JSON.stringify(record)}\n`).join("");
  if (output_path !== null) {
    await writeFile(output_path, jsonl, "utf8");
    return;
  }
  if (jsonl === "") {
    return;
  }
  // stdout writes can complete asynchronously (for example when piped). Waiting for the write
  // callback keeps a later process exit from truncating the output.
  await new Promise<void>((resolve, reject) => {
    process.stdout.write(jsonl, (error) => {
      if (error) {
        reject(error);
        return;
      }
      resolve();
    });
  });
}
/**
 * Turn a SingleFile export of a GitHub Gist page into normalized comment records.
 *
 * The HTML is first reduced by `preprocess_singlefile_html`, then loaded into Cheerio. Page
 * metadata is read from the original (unreduced) text. Each comment group is parsed into a raw
 * comment and normalized, and the finished records are passed to `audit_records`.
 *
 * @param html - Complete saved HTML document text; must be non-empty.
 * @param input_filename - Original input filename for provenance in JSONL records.
 * @param options - Flags controlling large optional fields.
 * @returns Normalized comment JSON records in page order.
 * @throws Error when `html` is empty or no comment groups are found.
 */
function parse_gist_comments(html: string, input_filename: MaybeString, options: ParserOptions): GistCommentJson[] {
  A.gt(html.length, 0, "html must be non-empty.");
  const { html: reduced_html, data_uris, stats } = preprocess_singlefile_html(html);
  const $ = cheerio.load(reduced_html, { scriptingEnabled: false });
  const page = parse_page_metadata(html, $, input_filename);
  const turndown_service = make_turndown_service();
  const comment_groups = find_comment_groups($);
  A.gt(comment_groups.length, 0, "No gist comment groups were found. The page structure may have changed.");
  const records = comment_groups.toArray().map((element, zero_based_index): GistCommentJson => {
    // Comment indexes in the output are one-based.
    const raw_comment = parse_raw_comment($, $(element), zero_based_index + 1, data_uris, options);
    return normalize_comment(raw_comment, page, turndown_service, options);
  });
  audit_records($, records, stats);
  return records;
}
/**
 * Shrink SingleFile HTML before DOM parsing by dropping markup that is irrelevant to comments.
 *
 * Steps run in this order, each on the output of the previous one: remove `<style>`, `<script>`,
 * `<template>` and `<svg>` spans; empty the contents of `<details-menu>` and `<include-fragment>`
 * elements; replace `data:` attribute values with short placeholders.
 *
 * NOTE(review): `<template>` spans are removed here, yet `parse_raw_comment` later looks for
 * `template` descendants of `<relative-time>` — confirm that stripping them is intended.
 *
 * @param html - Complete SingleFile HTML text; must be non-empty.
 * @returns Reduced HTML, the store holding every replaced data URI, and per-step counts.
 */
function preprocess_singlefile_html(html: string): PreprocessedHtml {
  A.gt(html.length, 0, "html must be non-empty.");
  const data_uris = new DataUriStore();
  let working_html = html;
  // Runs one reduction step on the current text, keeps its output, and reports its count.
  const apply_step = (step: (current: string) => { text: string; count: number }): number => {
    const outcome = step(working_html);
    working_html = outcome.text;
    return outcome.count;
  };
  const removed_style_blocks = apply_step((current) =>
    replace_and_count(current, /<style\b[^>]*>[\s\S]*?<\/style>/giu, ""),
  );
  const removed_script_blocks = apply_step((current) =>
    replace_and_count(current, /<script\b[^>]*>[\s\S]*?<\/script>/giu, ""),
  );
  const removed_template_blocks = apply_step((current) =>
    replace_and_count(current, /<template\b[^>]*>[\s\S]*?<\/template>/giu, ""),
  );
  const removed_svg_blocks = apply_step((current) =>
    replace_and_count(current, /<svg\b[^>]*>[\s\S]*?<\/svg>/giu, ""),
  );
  const compacted_details_menu_blocks = apply_step((current) => compact_element_and_count(current, "details-menu"));
  const compacted_include_fragments = apply_step((current) => compact_element_and_count(current, "include-fragment"));
  const replaced_data_uri_attributes = apply_step((current) => replace_data_uri_attributes(current, data_uris));
  return {
    html: working_html,
    data_uris,
    stats: {
      original_chars: html.length,
      preprocessed_chars: working_html.length,
      removed_style_blocks,
      removed_script_blocks,
      removed_template_blocks,
      removed_svg_blocks,
      compacted_details_menu_blocks,
      compacted_include_fragments,
      replaced_data_uri_attributes,
      unique_data_uris: data_uris.unique_count(),
    },
  };
}
/**
 * Substitute every match of a global regular expression and report how many were substituted.
 *
 * The replacement is inserted verbatim: because it is returned from a replacer function,
 * `$`-patterns such as `$&` inside it are not expanded.
 *
 * @param text - Input text to transform.
 * @param pattern - Global regular expression describing removable spans.
 * @param replacement - Replacement text to use for each match.
 * @returns The transformed text and replacement count.
 * @throws Error when `pattern` lacks the global flag.
 */
function replace_and_count(text: string, pattern: RegExp, replacement: string): { text: string; count: number } {
  A(pattern.global, "replace_and_count requires a global regular expression.");
  let matches_seen = 0;
  function substitute(): string {
    matches_seen++;
    return replacement;
  }
  const result_text = text.replace(pattern, substitute);
  return { text: result_text, count: matches_seen };
}
/**
 * Empty every `<tag_name …>…</tag_name>` element while keeping its opening-tag attributes.
 *
 * Matching is regex-based, case-insensitive, and stops at the first closing tag after each opening tag.
 *
 * @param html - HTML text to transform.
 * @param tag_name - Custom element tag name whose inner fallback UI is irrelevant; must be non-empty.
 * @returns The transformed HTML and number of compacted elements.
 */
function compact_element_and_count(html: string, tag_name: string): { text: string; count: number } {
  A(tag_name, "tag_name must be non-empty.");
  const literal_tag = escape_regexp(tag_name);
  const element_pattern = new RegExp("<" + literal_tag + "\\b([^>]*)>[\\s\\S]*?<\\/" + literal_tag + ">", "giu");
  let compacted = 0;
  const compacted_html = html.replace(element_pattern, (_whole: string, attrs: string): string => {
    compacted++;
    return "<" + tag_name + attrs + "></" + tag_name + ">";
  });
  return { text: compacted_html, count: compacted };
}
/**
 * Make text safe to embed in a regular expression so that it matches itself literally.
 *
 * @param text - Text that should match literally.
 * @returns The text with each of `. * + ? ^ $ { } ( ) | [ ] \` preceded by a backslash.
 */
function escape_regexp(text: string): string {
  const metacharacters = /[.*+?^${}()|[\]\\]/gu;
  return text.replace(metacharacters, (metacharacter) => "\\" + metacharacter);
}
/**
 * Swap long `data:` values of `src`, `href`, and `poster` attributes for short placeholders.
 *
 * Double-quoted attributes are processed across the whole document first, then single-quoted
 * ones; the store assigns sequence numbers in that order.
 *
 * @param html - Reduced HTML that may still contain data URI attributes.
 * @param data_uris - Store that receives each unique data URI.
 * @returns HTML with data URIs replaced and the number of attribute occurrences replaced.
 */
function replace_data_uri_attributes(html: string, data_uris: DataUriStore): { text: string; count: number } {
  let occurrences = 0;
  // Shared replacer for both passes: counts the occurrence and emits the placeholder attribute.
  const to_placeholder_attr = (_whole: string, attr_name: string, quote: string, data_uri: string): string => {
    occurrences++;
    return format_replaced_data_uri_attr(attr_name, quote, data_uri, data_uris);
  };
  const after_double_quoted = html.replace(/\b(src|href|poster)=(")(data:[^"]*)"/giu, to_placeholder_attr);
  const after_single_quoted = after_double_quoted.replace(/\b(src|href|poster)=(')(data:[^']*)'/giu, to_placeholder_attr);
  return { text: after_single_quoted, count: occurrences };
}
/**
 * Render one `name=<quote>placeholder<quote>` attribute for a data URI value.
 *
 * The data URI is stored (or reused) in `data_uris`, and its placeholder URL is written in its place.
 *
 * @param attr_name - URL-bearing attribute name, such as `src` or `href`; must be non-empty.
 * @param quote - The quote character used around the original attribute value (`"` or `'`).
 * @param data_uri - Complete original data URI value; must start with `data:`.
 * @param data_uris - Store that receives or reuses the data URI.
 * @returns A replacement attribute string using the stored placeholder URL.
 */
function format_replaced_data_uri_attr(attr_name: string, quote: string, data_uri: string, data_uris: DataUriStore): string {
  A(attr_name, "attr_name must be non-empty.");
  A(quote === "\"" || quote === "'", "quote must be a single or double quote.");
  A(data_uri.startsWith("data:"), "data_uri must start with data:.");
  const { placeholder_url } = data_uris.store(data_uri);
  return [attr_name, "=", quote, placeholder_url, quote].join("");
}
/**
 * Gather page-level provenance and gist identity.
 *
 * The source URL and save date come from the SingleFile header in the original text, the gist
 * owner/id/URL are derived from that source URL, and the title comes from the first `<title>` element.
 *
 * @param html - Complete original HTML document text, including the SingleFile comment header.
 * @param $ - Cheerio document API for the preprocessed page.
 * @param input_filename - Original input filename, if one was supplied by the CLI.
 * @returns Page metadata suitable for repeating in each JSONL object.
 */
function parse_page_metadata(html: string, $: CheerioAPI, input_filename: MaybeString): PageMetadata {
  A.gt(html.length, 0, "html must be non-empty.");
  const { source_url, saved_at } = parse_singlefile_metadata(html);
  const title_text = clean_text($("title").first().text());
  const { owner, gist_id, gist_url } = parse_gist_url_parts(source_url);
  return {
    source_url,
    saved_at,
    gist_owner: owner,
    gist_id,
    gist_url,
    // An empty cleaned title is reported as null.
    title: title_text || null,
    input_filename,
  };
}
/**
 * Parse the SingleFile provenance comment from the top of the HTML file.
 *
 * SingleFile writes `url: …` and `saved date: …` on lines of their own. Each label is therefore
 * only accepted at the start of a line (after optional spaces or tabs), so text such as
 * `data-url: …` or `"html_url": …` elsewhere in the document is not mistaken for the header.
 * Whitespace around the value is matched horizontally only, so an empty value cannot swallow
 * the following line.
 *
 * @param html - Complete HTML document text; must be non-empty.
 * @returns The source URL and saved timestamp text if SingleFile wrote them, else null for each.
 */
function parse_singlefile_metadata(html: string): { source_url: MaybeString; saved_at: MaybeString } {
  A.gt(html.length, 0, "html must be non-empty.");
  // A value ends at the line end, at the comment terminator, or at the end of input.
  const source_match = html.match(/(?:^|[\r\n])[ \t]*url:[ \t]*([^\r\n]*?)[ \t]*(?:[\r\n]|-->|$)/u);
  const saved_match = html.match(/(?:^|[\r\n])[ \t]*saved date:[ \t]*([^\r\n]*?)[ \t]*(?:[\r\n]|-->|$)/u);
  return {
    source_url: source_match?.[1]?.trim() || null,
    saved_at: saved_match?.[1]?.trim() || null,
  };
}
/**
 * Break a GitHub Gist URL into owner, gist id, and canonical gist URL.
 *
 * Only `https://gist.github.com/<owner>/<hex id>` URLs (optionally followed by a path, query, or
 * fragment) are recognized; matching is case-insensitive.
 *
 * @param source_url - The page URL saved by SingleFile, if present.
 * @returns The three components, or all nulls when the URL is null or not a recognized gist URL.
 */
function parse_gist_url_parts(source_url: MaybeString): { owner: MaybeString; gist_id: MaybeString; gist_url: MaybeString } {
  const gist_url_pattern = /^https:\/\/gist\.github\.com\/([^/?#]+)\/([0-9a-f]+)(?:[/?#].*)?$/iu;
  const parts = source_url === null ? null : source_url.match(gist_url_pattern);
  if (parts === null) {
    return { owner: null, gist_id: null, gist_url: null };
  }
  const [, owner, gist_id] = parts;
  return {
    owner,
    gist_id,
    gist_url: [GITHUB_GIST_ORIGIN, owner, gist_id].join("/"),
  };
}
/**
 * Select the elements that are real gist comments, ignoring unrelated GitHub chrome.
 *
 * A candidate is a `div.timeline-comment` whose id is exactly `gistcomment-<digits>` and that
 * contains exactly one rendered Markdown comment body.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @returns A Cheerio selection containing only rendered gist comments.
 */
function find_comment_groups($: CheerioAPI): HtmlSelection {
  const body_selector = ".comment-body.markdown-body.js-comment-body";
  return $("div.timeline-comment[id^='gistcomment-']").filter((_index, element) => {
    const candidate = $(element);
    if (!/^gistcomment-\d+$/u.test(candidate.attr("id") ?? "")) {
      return false;
    }
    return candidate.find(body_selector).length === 1;
  });
}
/**
 * Extract a raw comment record from a comment group.
 *
 * The function first resolves every DOM anchor it needs (container, header, body, author link,
 * timestamp, avatar, edit form, menus), then validates the comment id, and finally reads each
 * output field from those anchors. Helpers such as `first_selection`, `nullable_attr`,
 * `normalize_url`, and the `collect_*` functions are defined elsewhere in this file.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param comment_group - The `div#gistcomment-NNN.timeline-comment` element for one comment.
 * @param index - One-based comment index in page order.
 * @param data_uris - Store containing placeholders for embedded SingleFile data URIs.
 * @param options - Flags controlling whether large raw fields are retained.
 * @returns A raw comment with DOM-derived fields and rendered body HTML.
 * @throws Error when the index is below 1, the selection is not exactly one element, the body or
 * id is missing or malformed, or the id in the permalink/edit URL disagrees with the element id.
 */
function parse_raw_comment(
$: CheerioAPI,
comment_group: HtmlSelection,
index: number,
data_uris: DataUriStore,
options: ParserOptions,
): RawGistComment {
A.gte(index, 1, "index is one-based and must be positive.");
A.eq(comment_group.length, 1, "comment_group must contain exactly one comment element.");
// NOTE(review): `first_selection` presumably narrows a selection to at most one element — confirm.
// The container is the enclosing ancestor; the avatar is looked up from it, not from the comment group.
const container = first_selection(comment_group.closest(".js-comment-container"));
// Header and edit form are taken from direct children only.
const header = first_selection(comment_group.children(".timeline-comment-header"));
const body = first_selection(comment_group.find(".comment-body.markdown-body.js-comment-body"));
const author_link = first_selection(header.find("a.author"));
const timestamp_link = first_selection(header.find("a.js-timestamp"));
const relative_time = first_selection(timestamp_link.find("relative-time"));
const avatar_img = first_selection(container.find(".TimelineItem-avatar img.avatar-user"));
const edit_form = first_selection(comment_group.children("form.js-comment-update"));
const preview_form = first_selection(edit_form.find(".js-previewable-comment-form"));
// The actions menu is any header details-menu with a src, except the edit-history menu.
const actions_menu = first_selection(header.find("details-menu[src]").not(".js-comment-edit-history-menu"));
const edit_history = first_selection(header.find(".js-comment-edit-history"));
const edit_history_url = first_selection(edit_history.find("details-menu.js-comment-edit-history-menu[src]"));
A.eq(body.length, 1, `Comment ${index} has no rendered Markdown body.`);
const comment_group_id = nullable_attr(comment_group, "id");
A(comment_group_id !== null, `Comment ${index} is missing a gistcomment id.`);
const comment_id = extract_comment_id_from_group_id(comment_group_id);
A(comment_id !== null, `Comment ${index} has malformed comment anchor: ${comment_group_id}`);
const permalink_url = normalize_url(nullable_attr(timestamp_link, "href"));
const edit_action = normalize_url(nullable_attr(edit_form, "action"));
// Cross-check: when the permalink or edit-form URL carries a comment id, it must equal the id
// taken from the element id. The permalink is consulted first, then the edit action.
const url_comment_id = first_non_null([
extract_comment_id_from_url(permalink_url),
extract_comment_id_from_url(edit_action),
]);
if (url_comment_id !== null) {
A.eq(comment_id, url_comment_id, `Comment ${index} has inconsistent comment ids.`);
}
const body_html = html_selection_inner_html_for_output($, body, data_uris, options);
const edit_info = {
// A comment counts as edited when the header contains an edit-history element.
is_edited: edit_history.length === 1,
update_form_action: edit_action,
update_form_method: nullable_attr(edit_form, "method"),
preview_url: normalize_url(nullable_attr(preview_form, "data-preview-url")),
comment_actions_menu_url: normalize_url(nullable_attr(actions_menu, "src")),
edit_history_url: normalize_url(nullable_attr(edit_history_url, "src")),
};
return {
index: index,
container_classes: class_list(container),
comment_group_id: comment_group_id,
comment_group_classes: class_list(comment_group),
comment_id: comment_id,
// An empty cleaned author text is reported as null.
author_login: clean_text(author_link.text()) || null,
author_url: normalize_url(nullable_attr(author_link, "href")),
author_link_text: nullable_text(author_link),
author_hovercard_url: normalize_url(nullable_attr(author_link, "data-hovercard-url")),
author_badges: collect_author_badges($, header),
// Avatar details are only parsed when exactly one avatar image was found.
avatar: avatar_img.length === 1 ? parse_avatar(avatar_img, data_uris, options) : null,
permalink_url: permalink_url,
permalink_id: nullable_attr(timestamp_link, "id"),
permalink_fragment: extract_fragment_from_url(permalink_url),
created_at: nullable_attr(relative_time, "datetime"),
created_at_title: nullable_attr(relative_time, "title"),
// Prefer the <relative-time> text without template descendants; fall back to the link text when that is empty.
created_at_text: text_without_descendants($, relative_time, "template") || nullable_text(timestamp_link),
created_at_ms: date_ms_from_iso(nullable_attr(relative_time, "datetime")),
relative_time_template: text_of_descendants($, relative_time, "template"),
edit: edit_info,
body_html: body_html,
body_text: text_without_descendants($, body, "template"),
links: collect_links($, body, data_uris, options),
images: collect_images($, body, data_uris, options),
task_list_items: collect_task_list_items($, body),
code_blocks: collect_code_blocks($, body),
reactions: collect_reactions($, comment_group),
// The outer HTML serializes the whole container and is only produced on request.
outer_html: options.include_outer_html ? html_selection_outer_html_for_output($, container, data_uris, options) : null,
};
}
/**
 * Reshape a raw comment into the final JSONL schema.
 *
 * Adds the Markdown rendering of the body, boolean flags derived from CSS classes and badges,
 * and the embedded-data-URI count. `body_html` and `outer_html` are appended last, and only when
 * the corresponding option is enabled.
 *
 * @param raw_comment - DOM-shaped comment data from the first parsing phase.
 * @param page - Page-level metadata to attach to the comment.
 * @param turndown_service - Configured HTML-to-Markdown converter.
 * @param options - Flags controlling optional large fields.
 * @returns One JSON-serializable comment object.
 */
function normalize_comment(
  raw_comment: RawGistComment,
  page: PageMetadata,
  turndown_service: TurndownService,
  options: ParserOptions,
): GistCommentJson {
  const body_markdown = html_to_github_markdown(raw_comment.body_html, turndown_service);
  const group_classes = raw_comment.comment_group_classes;
  const embedded_data_uri_count = count_embedded_data_uri_summaries(raw_comment);
  const { edit, outer_html } = raw_comment;
  return {
    record_type: "gist_comment",
    page,
    index: raw_comment.index,
    comment_id: raw_comment.comment_id,
    comment_anchor: raw_comment.comment_group_id,
    permalink_url: raw_comment.permalink_url,
    permalink_fragment: raw_comment.permalink_fragment,
    author: {
      login: raw_comment.author_login,
      url: raw_comment.author_url,
      link_text: raw_comment.author_link_text,
      hovercard_url: raw_comment.author_hovercard_url,
      badges: raw_comment.author_badges,
    },
    avatar: raw_comment.avatar,
    created_at: raw_comment.created_at,
    created_at_title: raw_comment.created_at_title,
    created_at_text: raw_comment.created_at_text,
    created_at_ms: raw_comment.created_at_ms,
    relative_time_template: raw_comment.relative_time_template,
    is_current_user: group_classes.includes("current-user"),
    is_author: raw_comment.author_badges.includes("Author"),
    is_edited: edit.is_edited,
    is_minimized: group_classes.includes("minimized-comment"),
    container_classes: raw_comment.container_classes,
    comment_group_classes: group_classes,
    edit,
    edit_form_action: edit.update_form_action,
    preview_url: edit.preview_url,
    body_markdown,
    body_text: raw_comment.body_text,
    links: raw_comment.links,
    images: raw_comment.images,
    task_list_items: raw_comment.task_list_items,
    code_blocks: raw_comment.code_blocks,
    reactions: raw_comment.reactions,
    embedded_data_uri_count,
    // Optional large fields stay at the end so the serialized key order is unchanged.
    ...(options.include_body_html ? { body_html: raw_comment.body_html } : {}),
    ...(options.include_outer_html && outer_html !== null ? { outer_html } : {}),
  };
}
/**
 * Tally how many extracted fields of one raw comment carry a data URI summary.
 *
 * Occurrences are counted, not unique payloads: the avatar contributes at most one, and every
 * link or image whose `embedded_data_uri` is not null contributes one.
 *
 * @param raw_comment - Raw comment whose extracted fields may include data URI summaries.
 * @returns Count of avatar, image, and link data URI payloads referenced by the comment.
 */
function count_embedded_data_uri_summaries(raw_comment: RawGistComment): number {
  // `!= null` rejects both null and undefined, covering a missing avatar as well.
  const avatar_hits = raw_comment.avatar?.embedded_data_uri != null ? 1 : 0;
  const link_hits = raw_comment.links.filter((link) => link.embedded_data_uri !== null).length;
  const image_hits = raw_comment.images.filter((image) => image.embedded_data_uri !== null).length;
  return avatar_hits + link_hits + image_hits;
}
/**
 * Create the Turndown converter used for comment bodies, configured for GitHub-flavored Markdown.
 *
 * On top of the GFM plugin it drops `script`, `style`, and `template` elements and registers two
 * custom rules: anchors whose text is just their URL become `<url>` autolinks, and elements
 * carrying the `sf-hidden` class are omitted.
 *
 * @returns A Turndown service with the GFM plugin and the custom rules installed.
 */
function make_turndown_service(): TurndownService {
  const service = new TurndownService({
    headingStyle: "atx",
    hr: "---",
    bulletListMarker: "-",
    codeBlockStyle: "fenced",
    fence: "```",
    emDelimiter: "_",
    strongDelimiter: "**",
    linkStyle: "inlined",
  });
  service.use(gfm);
  service.remove(["script", "style", "template"]);

  // An anchor is "autolink-like" when its visible text equals its href, with or without the scheme.
  const is_autolink_like_anchor = (node: TurndownNode): boolean => {
    if (node.nodeName.toLowerCase() !== "a") {
      return false;
    }
    const href = node.getAttribute("href");
    if (href === null || href === "") {
      return false;
    }
    const visible_text = clean_text(node.textContent ?? "");
    // GitHub often renders bare URLs as anchors whose text omits the scheme.
    return visible_text === href || visible_text === strip_url_scheme(href);
  };
  const render_autolink = (_content: string, node: TurndownNode): string => {
    const href = node.getAttribute("href");
    A(href, "Autolink replacement requires href.");
    return "<" + href + ">";
  };
  service.addRule("github_autolink_like_anchors", {
    filter: is_autolink_like_anchor,
    replacement: render_autolink,
  });

  // SingleFile may leave hidden UI fragments near content; comments should not include them.
  const is_singlefile_hidden = (node: TurndownNode): boolean => {
    const class_names = (node.getAttribute("class") ?? "").split(/\s+/u);
    return class_names.includes("sf-hidden");
  };
  service.addRule("singlefile_hidden_elements", {
    filter: is_singlefile_hidden,
    replacement: (): string => "",
  });
  return service;
}
/**
 * Turn the rendered HTML of one GitHub comment body into normalized Markdown.
 *
 * @param body_html - Inner HTML of `.comment-body.markdown-body`.
 * @param turndown_service - Configured Turndown converter that performs the HTML-to-Markdown step.
 * @returns Markdown text after whitespace normalization, suitable for a JSON string field.
 */
function html_to_github_markdown(body_html: string, turndown_service: TurndownService): string {
  A.gte(body_html.length, 0, "body_html must be a string.");
  // Convert first, then tidy the whitespace of the converter's output.
  return normalize_markdown(turndown_service.turndown(body_html));
}
/**
 * Clean Markdown whitespace produced from GitHub's rendered HTML.
 *
 * Outside fenced code blocks, line endings are converted to `\n`, trailing whitespace is
 * removed from every line, and runs of blank lines collapse to a single blank line. Lines
 * inside fenced code blocks (``` or ~~~ fences) are copied verbatim apart from the line-ending
 * conversion, so code keeps its own blank lines and trailing spaces.
 *
 * @param markdown - Raw Markdown from Turndown.
 * @returns Markdown with stable blank lines and no surrounding whitespace.
 */
function normalize_markdown(markdown: string): string {
  const normalized_lines: string[] = [];
  // Marker character and run length of the fence currently open, or null outside code blocks.
  let open_fence: { marker: string; length: number } | null = null;
  let previous_was_blank = false;
  for (const line of markdown.replace(/\r\n?/gu, "\n").split("\n")) {
    if (open_fence !== null) {
      // Code content is preserved exactly; only look for the matching closing fence.
      normalized_lines.push(line);
      const closing_run = /^\s*(`{3,}|~{3,})\s*$/u.exec(line)?.[1];
      if (
        closing_run !== undefined &&
        closing_run.startsWith(open_fence.marker) &&
        closing_run.length >= open_fence.length
      ) {
        open_fence = null;
      }
      continue;
    }
    const trimmed_line = line.trimEnd();
    if (trimmed_line === "") {
      // Keep at most one blank line in a row.
      if (!previous_was_blank) {
        normalized_lines.push("");
      }
      previous_was_blank = true;
      continue;
    }
    previous_was_blank = false;
    normalized_lines.push(trimmed_line);
    // A backtick fence may not have backticks in its info string; that keeps inline code
    // such as "```a``` b" from being mistaken for an opening fence.
    const opening_run = /^\s*(`{3,}(?=[^`]*$)|~{3,})/u.exec(trimmed_line)?.[1];
    if (opening_run !== undefined) {
      open_fence = { marker: opening_run.charAt(0), length: opening_run.length };
    }
  }
  return normalized_lines.join("\n").trim();
}
/**
 * Read avatar metadata from the single avatar image belonging to a comment.
 *
 * @param avatar_img - Cheerio selection holding exactly one avatar `<img>` element.
 * @param data_uris - Store containing placeholder-to-data-URI mappings.
 * @param options - Flags controlling whether embedded data URIs are retained.
 * @returns Structured avatar information, including the resolved `src`.
 */
function parse_avatar(avatar_img: HtmlSelection, data_uris: DataUriStore, options: ParserOptions): AvatarInfo {
  A.eq(avatar_img.length, 1, "avatar_img must contain exactly one image.");
  // The src may be a data URI placeholder, so it goes through the shared resolver.
  const { url, embedded_data_uri } = resolve_url_attr(avatar_img, "src", data_uris, options);
  const avatar: AvatarInfo = {
    alt: nullable_attr(avatar_img, "alt"),
    width: integer_attr(avatar_img, "width"),
    height: integer_attr(avatar_img, "height"),
    src: url,
    original_src: normalize_url(nullable_attr(avatar_img, "data-sf-original-src")),
    embedded_data_uri: embedded_data_uri,
  };
  return avatar;
}
/**
 * Gather the non-empty text of every `.Label` badge in a comment header.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param header - Comment header for one gist comment.
 * @returns De-duplicated badge texts in DOM order.
 */
function collect_author_badges($: CheerioAPI, header: HtmlSelection): string[] {
  const label_texts = header
    .find(".Label")
    .toArray()
    .map((label_element) => clean_text($(label_element).text()))
    .filter((label_text) => label_text !== "");
  return unique_strings(label_texts);
}
/**
 * Describe every `<a href>` found inside one rendered comment body.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param body - Cheerio selection for one `.comment-body.markdown-body` element.
 * @param data_uris - Store containing placeholder-to-data-URI mappings.
 * @param options - Flags controlling whether embedded data URIs are retained.
 * @returns Link metadata in body order, with one-based `index` values.
 */
function collect_links($: CheerioAPI, body: HtmlSelection, data_uris: DataUriStore, options: ParserOptions): LinkInfo[] {
  return body
    .find("a[href]")
    .toArray()
    .map((anchor_element, position): LinkInfo => {
      const anchor = $(anchor_element);
      const resolved_href = resolve_url_attr(anchor, "href", data_uris, options);
      return {
        index: position + 1,
        text: clean_text(anchor.text()),
        href: resolved_href.url,
        original_href: normalize_url(nullable_attr(anchor, "data-sf-original-href")),
        title: nullable_attr(anchor, "title"),
        rel: nullable_attr(anchor, "rel"),
        target: nullable_attr(anchor, "target"),
        aria_label: nullable_attr(anchor, "aria-label"),
        classes: class_list(anchor),
        embedded_data_uri: resolved_href.embedded_data_uri,
      };
    });
}
/**
 * Describe every `<img>` found inside one rendered comment body.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param body - Cheerio selection for one `.comment-body.markdown-body` element.
 * @param data_uris - Store containing placeholder-to-data-URI mappings.
 * @param options - Flags controlling whether embedded data URIs are retained.
 * @returns Image metadata in body order, with one-based `index` values.
 */
function collect_images($: CheerioAPI, body: HtmlSelection, data_uris: DataUriStore, options: ParserOptions): ImageInfo[] {
  const found_images: ImageInfo[] = [];
  for (const [position, img_element] of body.find("img").toArray().entries()) {
    const img = $(img_element);
    // The src may be a data URI placeholder, so it goes through the shared resolver.
    const { url, embedded_data_uri } = resolve_url_attr(img, "src", data_uris, options);
    found_images.push({
      index: position + 1,
      alt: nullable_attr(img, "alt"),
      title: nullable_attr(img, "title"),
      width: integer_attr(img, "width"),
      height: integer_attr(img, "height"),
      src: url,
      original_src: normalize_url(nullable_attr(img, "data-sf-original-src")),
      classes: class_list(img),
      embedded_data_uri: embedded_data_uri,
    });
  }
  return found_images;
}
/**
 * Describe each `li.task-list-item` inside a comment body.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param body - Cheerio selection for one `.comment-body.markdown-body` element.
 * @returns Task-list items in body order, with one-based `index` values.
 */
function collect_task_list_items($: CheerioAPI, body: HtmlSelection): TaskListItemInfo[] {
  return body
    .find("li.task-list-item")
    .toArray()
    .map((item_element, position): TaskListItemInfo => {
      const item = $(item_element);
      // Only the first checkbox in the item decides the checked state.
      const checkbox = first_selection(item.find("input[type='checkbox']"));
      const checked_attr = nullable_attr(checkbox, "checked");
      return {
        index: position + 1,
        checked: checked_attr !== null,
        text: clean_text(item.text()),
      };
    });
}
/**
 * Describe each `pre > code` element inside a rendered Markdown body.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param body - Cheerio selection for one `.comment-body.markdown-body` element.
 * @returns Code blocks in body order, each with its text, classes, and detected language (if any).
 */
function collect_code_blocks($: CheerioAPI, body: HtmlSelection): CodeBlockInfo[] {
  const blocks: CodeBlockInfo[] = [];
  for (const [position, code_node] of body.find("pre > code").toArray().entries()) {
    const code = $(code_node);
    const class_names = class_list(code);
    // The first class that encodes a language wins.
    const language_candidates = class_names.map((class_name) => extract_language_from_code_class(class_name));
    blocks.push({
      index: position + 1,
      language: first_non_null(language_candidates),
      code: code.text().replace(/\r\n?/gu, "\n"),
      classes: class_names,
    });
  }
  return blocks;
}
/**
 * Pull the language name out of a `language-<name>` CSS class.
 *
 * @param class_name - One CSS class from a `<code>` element.
 * @returns The text after `language-`, or null when the class does not have that form.
 */
function extract_language_from_code_class(class_name: string): MaybeString {
  const language_match = /^language-(.+)$/u.exec(class_name);
  if (language_match === null) {
    return null;
  }
  return language_match[1] ?? null;
}
/**
 * Describe the reaction entries found under `.comment-reactions` for one comment.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param comment_group - Comment group for one gist comment.
 * @returns Reaction metadata in DOM order, with one-based `index` values.
 */
function collect_reactions($: CheerioAPI, comment_group: HtmlSelection): ReactionInfo[] {
  // Any of these markers identifies a reaction entry inside the reactions container.
  const reaction_selectors = [
    ".comment-reactions .social-reaction-summary-item",
    ".comment-reactions [data-reaction]",
    ".comment-reactions [data-reaction-label]",
  ];
  return comment_group
    .find(reaction_selectors.join(", "))
    .toArray()
    .map((reaction_element, position): ReactionInfo => {
      const reaction = $(reaction_element);
      const reaction_classes = class_list(reaction);
      const counter_text = first_selection(reaction.find(".Counter")).text();
      // Prefer the dedicated counter; otherwise use the first digits in the element's own text.
      const count = first_non_null([integer_text(counter_text), integer_text(reaction.text())]);
      const emoji_text = clean_text(first_selection(reaction.find("g-emoji, .emoji")).text());
      return {
        index: position + 1,
        label: nullable_attr(reaction, "aria-label") ?? nullable_attr(reaction, "data-reaction-label"),
        emoji: emoji_text === "" ? null : emoji_text,
        count: count,
        reacted_by_current_user: reaction_classes.includes("user-has-reacted"),
        classes: reaction_classes,
      };
    });
}
/**
 * Serialize the inner HTML of one element after rewriting data URI placeholders.
 *
 * The rewrite happens on a clone, so the parsed document itself is not modified here.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param selection - Selection (exactly one element) whose inner HTML should be serialized.
 * @param data_uris - Store containing placeholder-to-data-URI mappings.
 * @param options - Flags controlling whether embedded data URIs are retained.
 * @returns Inner HTML of the rewritten clone, or an empty string when it has none.
 */
function html_selection_inner_html_for_output(
  $: CheerioAPI,
  selection: HtmlSelection,
  data_uris: DataUriStore,
  options: ParserOptions,
): string {
  A.eq(selection.length, 1, "selection must contain exactly one element.");
  const detached_copy = selection.clone();
  rewrite_data_uri_placeholders_in_tree($, detached_copy, data_uris, options);
  const inner_html = detached_copy.html();
  return inner_html ?? "";
}
/**
 * Serialize the outer HTML of one element after rewriting data URI placeholders.
 *
 * The rewrite happens on a clone, so the parsed document itself is not modified here.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param selection - Selection (exactly one element) whose outer HTML should be serialized.
 * @param data_uris - Store containing placeholder-to-data-URI mappings.
 * @param options - Flags controlling whether embedded data URIs are retained.
 * @returns Outer HTML of the rewritten clone.
 */
function html_selection_outer_html_for_output(
  $: CheerioAPI,
  selection: HtmlSelection,
  data_uris: DataUriStore,
  options: ParserOptions,
): string {
  A.eq(selection.length, 1, "selection must contain exactly one element.");
  const detached_copy = selection.clone();
  rewrite_data_uri_placeholders_in_tree($, detached_copy, data_uris, options);
  const outer_html = $.html(detached_copy);
  return outer_html;
}
/**
 * Rewrite placeholder URLs on every `src`, `href`, and `poster` attribute in a cloned subtree.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param root - Cloned root whose URL-bearing attributes may contain data URI placeholders.
 * @param data_uris - Store containing placeholder-to-data-URI mappings.
 * @param options - Flags controlling whether embedded data URIs are retained.
 * @returns Nothing; the cloned subtree is mutated in place.
 */
function rewrite_data_uri_placeholders_in_tree(
  $: CheerioAPI,
  root: HtmlSelection,
  data_uris: DataUriStore,
  options: ParserOptions,
): void {
  const url_attr_selector = "[src], [href], [poster]";
  const url_attr_names = ["src", "href", "poster"] as const;
  // `find` only sees descendants, so the root element itself is added separately.
  const candidates = root.find(url_attr_selector).add(root.filter(url_attr_selector));
  candidates.each((_index, candidate_element) => {
    const candidate = $(candidate_element);
    for (const attr_name of url_attr_names) {
      rewrite_data_uri_placeholder_attr(candidate, attr_name, data_uris, options);
    }
  });
}
/**
 * Replace one URL-bearing attribute when its value is a known data URI placeholder.
 *
 * The replacement is the stored data URI when data URIs are retained; otherwise it is the
 * original URL recorded by SingleFile, or a hash-based stand-in when none was recorded. The
 * payload hash is always recorded in a `data-singlefile-embedded-*-sha256` attribute, and an
 * `-omitted` marker is added when the payload itself is left out.
 *
 * @param element - Element that may carry the URL-bearing attribute.
 * @param attr_name - Attribute name to inspect and possibly rewrite.
 * @param data_uris - Store containing placeholder-to-data-URI mappings.
 * @param options - Flags controlling whether embedded data URIs are retained.
 * @returns Nothing; the element is mutated when a placeholder is found.
 */
function rewrite_data_uri_placeholder_attr(
  element: HtmlSelection,
  attr_name: "src" | "href" | "poster",
  data_uris: DataUriStore,
  options: ParserOptions,
): void {
  const stored = data_uris.lookup_placeholder(nullable_attr(element, attr_name));
  if (stored === null) {
    // Not one of our placeholders: leave the attribute alone.
    return;
  }
  const recorded_original = original_url_for_attr(element, attr_name);
  const keep_payload = options.include_data_uris;
  const output_url = keep_payload
    ? stored.data_uri
    : (recorded_original ?? omitted_data_uri_url(stored.summary));
  element.attr(attr_name, output_url);
  element.attr(`data-singlefile-embedded-${attr_name}-sha256`, stored.summary.payload_sha256);
  if (!keep_payload) {
    element.attr(`data-singlefile-embedded-${attr_name}-omitted`, "true");
  }
}
/**
 * Work out the URL to report for one URL-bearing attribute in structured JSON output.
 *
 * @param element - Element carrying the URL-bearing attribute.
 * @param attr_name - Attribute name to inspect and resolve.
 * @param data_uris - Store containing placeholder-to-data-URI mappings.
 * @param options - Flags controlling whether embedded data URIs are retained.
 * @returns The normalized URL, plus the data URI summary when the attribute held a placeholder.
 */
function resolve_url_attr(
  element: HtmlSelection,
  attr_name: "src" | "href" | "poster",
  data_uris: DataUriStore,
  options: ParserOptions,
): { url: MaybeString; embedded_data_uri: EmbeddedDataUriSummary | null } {
  const raw_url = nullable_attr(element, attr_name);
  const stored = data_uris.lookup_placeholder(raw_url);
  if (stored !== null) {
    const recorded_original = original_url_for_attr(element, attr_name);
    // Without the payload, prefer SingleFile's original URL over the hash-based stand-in.
    const chosen_url = options.include_data_uris
      ? stored.data_uri
      : (recorded_original ?? omitted_data_uri_url(stored.summary));
    return { url: normalize_url(chosen_url), embedded_data_uri: stored.summary };
  }
  // Ordinary URL: only normalization applies.
  return { url: normalize_url(raw_url), embedded_data_uri: null };
}
/**
 * Look up the `data-sf-original-*` attribute that pairs with a URL-bearing attribute.
 *
 * @param element - Element whose SingleFile metadata should be inspected.
 * @param attr_name - URL-bearing attribute whose original value is wanted.
 * @returns The normalized original URL, or null when the paired attribute is missing or blank.
 */
function original_url_for_attr(element: HtmlSelection, attr_name: "src" | "href" | "poster"): MaybeString {
  let recorded_url: MaybeString;
  switch (attr_name) {
    case "src":
      recorded_url = nullable_attr(element, "data-sf-original-src");
      break;
    case "href":
      recorded_url = nullable_attr(element, "data-sf-original-href");
      break;
    default:
      recorded_url = nullable_attr(element, "data-sf-original-poster");
      break;
  }
  return normalize_url(recorded_url);
}
/**
 * Build the stand-in URL used when a data URI is omitted and no original URL is known.
 *
 * @param summary - Summary of the omitted data URI payload.
 * @returns The placeholder prefix followed by `sha256-` and the payload hash.
 */
function omitted_data_uri_url(summary: EmbeddedDataUriSummary): string {
  const hash_suffix = "sha256-" + summary.payload_sha256;
  return DATA_URI_PLACEHOLDER_PREFIX + hash_suffix;
}
/**
 * Narrow a selection to its first element; an empty selection stays empty.
 *
 * @param selection - Cheerio selection whose first element is desired.
 * @returns A Cheerio selection with zero or one element.
 */
function first_selection(selection: HtmlSelection): HtmlSelection {
  const leading_element = selection.first();
  return leading_element;
}
/**
 * Read an attribute as a trimmed string, mapping "absent" and "blank" to null.
 *
 * @param selection - Cheerio selection whose first element may carry the attribute.
 * @param attr_name - HTML attribute name to read; must be non-empty.
 * @returns The trimmed attribute value, or null when the selection is empty or the value is missing/blank.
 */
function nullable_attr(selection: HtmlSelection, attr_name: string): MaybeString {
  A(attr_name, "attr_name must be non-empty.");
  // An empty selection is treated the same as a missing attribute.
  const raw_value = selection.length === 0 ? undefined : selection.attr(attr_name);
  if (raw_value === undefined) {
    return null;
  }
  const trimmed = raw_value.trim();
  if (trimmed === "") {
    return null;
  }
  return trimmed;
}
/**
 * Read a selection's cleaned text, mapping "no element" and "blank text" to null.
 *
 * @param selection - Cheerio selection whose text should be read.
 * @returns Cleaned text, or null when the selection is empty or its text is blank.
 */
function nullable_text(selection: HtmlSelection): MaybeString {
  if (selection.length === 0) {
    return null;
  }
  const cleaned = clean_text(selection.text());
  // An empty string is falsy, so blank text becomes null.
  return cleaned || null;
}
/**
 * Read a selection's text while ignoring the text of selected descendants.
 *
 * The descendants are removed from a clone, so the original selection is left untouched.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param selection - Selection whose text should be read.
 * @param descendants_selector - Descendant selector to remove from the clone before reading text.
 * @returns Cleaned text from the clone, or null when the selection is empty or the text is blank.
 */
function text_without_descendants($: CheerioAPI, selection: HtmlSelection, descendants_selector: string): MaybeString {
  A(descendants_selector, "descendants_selector must be non-empty.");
  if (selection.length === 0) {
    return null;
  }
  const pruned_copy = selection.clone();
  pruned_copy.find(descendants_selector).remove();
  const remaining_text = clean_text(pruned_copy.text());
  // An empty string is falsy, so blank text becomes null.
  return remaining_text || null;
}
/**
 * Join the cleaned text of every matching descendant with single spaces.
 *
 * @param $ - Cheerio document API for the parsed page.
 * @param selection - Selection whose descendants should be read.
 * @param descendants_selector - Descendant selector whose text should be collected.
 * @returns Space-joined non-blank descendant texts, or null when there are none.
 */
function text_of_descendants($: CheerioAPI, selection: HtmlSelection, descendants_selector: string): MaybeString {
  A(descendants_selector, "descendants_selector must be non-empty.");
  if (selection.length === 0) {
    return null;
  }
  const pieces = selection
    .find(descendants_selector)
    .toArray()
    .map((descendant) => clean_text($(descendant).text()))
    .filter((piece) => piece !== "");
  return pieces.length > 0 ? pieces.join(" ") : null;
}
/**
 * Read an HTML attribute and parse it as a base-10 integer.
 *
 * @param selection - Cheerio selection whose first element may carry the attribute.
 * @param attr_name - HTML attribute name to parse.
 * @returns Parsed integer, or null when the attribute is missing/blank or does not parse.
 */
function integer_attr(selection: HtmlSelection, attr_name: string): MaybeNumber {
  const raw_value = nullable_attr(selection, attr_name);
  // A missing attribute is funnelled through NaN so both failure cases share one exit.
  const parsed = raw_value === null ? Number.NaN : Number.parseInt(raw_value, 10);
  if (!Number.isFinite(parsed)) {
    return null;
  }
  return parsed;
}
/**
 * Find the first run of digits in a string and parse it as a base-10 integer.
 *
 * @param text - Text that may contain a count.
 * @returns The first parsed integer, or null when the text contains no digits.
 */
function integer_text(text: string): MaybeNumber {
  const digit_run = /\d+/u.exec(text)?.[0];
  if (digit_run === undefined) {
    return null;
  }
  const parsed = Number.parseInt(digit_run, 10);
  if (!Number.isFinite(parsed)) {
    return null;
  }
  return parsed;
}
/**
 * Convert a datetime string to epoch milliseconds using `Date.parse`.
 *
 * @param iso_datetime - ISO-like datetime string from GitHub's `relative-time` element, or null.
 * @returns Epoch milliseconds, or null when the input is null or cannot be parsed.
 */
function date_ms_from_iso(iso_datetime: MaybeString): MaybeNumber {
  // A missing input is funnelled through NaN so both failure cases share one exit.
  const epoch_ms = iso_datetime === null ? Number.NaN : Date.parse(iso_datetime);
  if (!Number.isFinite(epoch_ms)) {
    return null;
  }
  return epoch_ms;
}
/**
 * Split an element's `class` attribute into individual class names.
 *
 * @param selection - Cheerio selection whose first element may carry classes.
 * @returns Class names in attribute order, without blank entries; empty when there is no class attribute.
 */
function class_list(selection: HtmlSelection): string[] {
  const class_attr = nullable_attr(selection, "class");
  return class_attr === null
    ? []
    : class_attr.split(/\s+/u).filter((class_name) => class_name !== "");
}
/**
 * Flatten human-readable text to a single line with single spaces between words.
 *
 * @param text - Raw text from Cheerio or a DOM node.
 * @returns Text with edges trimmed and every internal whitespace run replaced by one space.
 */
function clean_text(text: string): string {
  // Trimming first leaves only interior whitespace runs to collapse.
  const without_edges = text.trim();
  return without_edges.replace(/\s+/gu, " ");
}
/**
 * Make protocol-relative and root-relative URLs absolute; leave everything else untouched.
 *
 * @param url - A raw href/src/action value from the page, or null.
 * @returns `https:` + url for `//...`, the gist origin + url for `/...`, otherwise the input unchanged.
 */
function normalize_url(url: MaybeString): MaybeString {
  if (url === null || !url.startsWith("/")) {
    // Null, absolute URLs, fragments, and path-relative URLs pass through as-is.
    return url;
  }
  const is_protocol_relative = url.startsWith("//");
  return is_protocol_relative ? `https:${url}` : `${GITHUB_GIST_ORIGIN}${url}`;
}
/**
 * Drop a leading `http://` or `https://` (any letter case) from a string.
 *
 * Used when comparing an anchor's href with GitHub's rendered bare-link text.
 *
 * @param url - Absolute URL or any string that may begin with a URL scheme.
 * @returns The input without the leading scheme, or unchanged when it has none.
 */
function strip_url_scheme(url: string): string {
  const scheme_match = /^https?:\/\//iu.exec(url);
  if (scheme_match === null) {
    return url;
  }
  return url.slice(scheme_match[0].length);
}
/**
 * Read the numeric id out of an element id of the exact form `gistcomment-<digits>`.
 *
 * @param group_id - Element id from the comment group, if present.
 * @returns The digits as a string, or null when the id is null or has another form.
 */
function extract_comment_id_from_group_id(group_id: MaybeString): MaybeString {
  const id_match = group_id === null ? null : /^gistcomment-(\d+)$/u.exec(group_id);
  if (id_match === null) {
    return null;
  }
  return id_match[1] ?? null;
}
/**
 * Find a numeric comment id encoded in a GitHub permalink or comment path.
 *
 * The `permalink_comment_id` query parameter is tried first, then a `#gistcomment-` fragment,
 * then a `/comments/<id>` path segment; the first form that matches wins.
 *
 * @param url - URL or path that may contain `permalink_comment_id`, `gistcomment-`, or `/comments/`.
 * @returns The numeric comment id as a string, or null when no id is encoded.
 */
function extract_comment_id_from_url(url: MaybeString): MaybeString {
  if (url === null) {
    return null;
  }
  const id_patterns = [
    /[?&]permalink_comment_id=(\d+)/u,
    /#gistcomment-(\d+)/u,
    /\/comments\/(\d+)(?:\b|\/|$)/u,
  ];
  for (const id_pattern of id_patterns) {
    const comment_id = id_pattern.exec(url)?.[1];
    if (comment_id !== undefined) {
      return comment_id;
    }
  }
  return null;
}
/**
 * Return everything after the first `#` in a URL string.
 *
 * @param url - URL that may contain a `#fragment` component, or null.
 * @returns Fragment text without the hash, or null when there is no `#` or nothing follows it.
 */
function extract_fragment_from_url(url: MaybeString): MaybeString {
  const hash_position = url === null ? -1 : url.indexOf("#");
  if (url === null || hash_position === -1) {
    return null;
  }
  const fragment = url.slice(hash_position + 1);
  return fragment.length > 0 ? fragment : null;
}
/**
 * Pick the earliest candidate that is not null.
 *
 * @param values - Candidate values ordered from most to least preferred.
 * @returns The first non-null value, or null if the list is empty or all candidates are null.
 */
function first_non_null<T>(values: readonly (T | null)[]): T | null {
  const position = values.findIndex((candidate) => candidate !== null);
  if (position === -1) {
    return null;
  }
  // The element at `position` was just checked to be non-null.
  return values[position] as T;
}
/**
 * Drop repeated strings, keeping each value at the position of its first occurrence.
 *
 * @param values - String values that may contain duplicates.
 * @returns A new list with duplicates removed.
 */
function unique_strings(values: readonly string[]): string[] {
  // A Set iterates in insertion order, which is exactly first-seen order.
  const first_seen = new Set<string>(values);
  return Array.from(first_seen);
}
/**
 * Summarize a data URI without storing its large payload repeatedly in every JSON record.
 *
 * The text between `data:` and the first comma is treated as `;`-separated metadata. Only the
 * first segment can be the media type; when it is empty, looks like a parameter (contains `=`),
 * or is the bare `base64` token, the media type defaults to `text/plain`.
 *
 * @param data_uri - Complete `data:` URI from SingleFile.
 * @param token - Stable short token assigned by the data URI store.
 * @param placeholder_url - Placeholder URL used in preprocessed HTML.
 * @returns Media type, byte count when decoding succeeded, and a SHA-256 hash of the payload bytes.
 */
function summarize_data_uri(data_uri: string, token: string, placeholder_url: string): EmbeddedDataUriSummary {
  A(data_uri.startsWith("data:"), "summarize_data_uri expects a data: URI.");
  const comma_index = data_uri.indexOf(",");
  A.gte(comma_index, 0, "data: URI must contain a comma separator.");
  const metadata = data_uri.slice("data:".length, comma_index);
  const payload_text = data_uri.slice(comma_index + 1);
  const metadata_segments = metadata.split(";");
  const metadata_parts = metadata_segments.filter((part) => part !== "");
  // Read the media type positionally, before empty segments are dropped: in `data:;base64,...`
  // the first segment is empty, so `base64` must not be mistaken for the media type.
  const declared_media_type = metadata_segments[0] ?? "";
  const has_media_type =
    declared_media_type !== "" && !declared_media_type.includes("=") && declared_media_type !== "base64";
  const media_type = has_media_type ? declared_media_type : "text/plain";
  const is_base64 = metadata_parts.includes("base64");
  const decoded = decode_data_uri_payload(payload_text, is_base64);
  const payload_hash = createHash("sha256").update(decoded.bytes).digest("hex");
  return {
    token: token,
    placeholder_url: placeholder_url,
    media_type: media_type,
    is_base64: is_base64,
    payload_sha256: payload_hash,
    // The byte count is only meaningful when the payload decoded cleanly.
    payload_bytes: decoded.decode_error === null ? decoded.bytes.length : null,
    data_uri_chars: data_uri.length,
    decode_error: decoded.decode_error,
  };
}
/**
 * Decode the payload portion of a data URI.
 *
 * The payload is percent-decoded first in both modes, because a data URI is still a URL and
 * base64 characters such as `+`, `/`, and `=` may arrive as `%2B`, `%2F`, and `%3D`. The
 * result is then base64-decoded or UTF-8 encoded depending on `is_base64`. When percent
 * decoding fails, the raw payload text is used instead and the failure is reported.
 *
 * @param payload_text - Text after the comma in a data URI.
 * @param is_base64 - Whether the metadata declared base64 encoding.
 * @returns Decoded payload bytes, plus an error string when percent decoding failed.
 */
function decode_data_uri_payload(payload_text: string, is_base64: boolean): { bytes: Buffer; decode_error: MaybeString } {
  let unescaped_text = payload_text;
  let decode_error: MaybeString = null;
  try {
    // Text without any `%` escapes comes back unchanged, so plain base64 is unaffected.
    unescaped_text = decodeURIComponent(payload_text);
  } catch (error: unknown) {
    // XXX: Non-base64 data URIs are usually percent-encoded UTF-8, but the standard allows other charsets.
    decode_error = error instanceof Error ? error.message : String(error);
  }
  return { bytes: Buffer.from(unescaped_text, is_base64 ? "base64" : "utf8"), decode_error: decode_error };
}
/**
 * Check that the parsed records are consistent with the page they came from.
 *
 * Verified: at least one record exists; the record count equals both the rendered comment body
 * count and the comment group count; the data URI statistics are consistent; and every record
 * has a sequential one-based index, a unique numeric id, a matching anchor, a creation time,
 * and an author login.
 *
 * @param $ - Cheerio document API for the preprocessed page.
 * @param records - Parsed comment records that should correspond one-to-one with body nodes.
 * @param stats - Preprocessing statistics useful for consistency checks.
 * @returns Nothing; throws if a parser invariant is violated.
 */
function audit_records($: CheerioAPI, records: readonly GistCommentJson[], stats: HtmlPreprocessStats): void {
  A.gt(records.length, 0, "audit_records requires at least one record.");
  const rendered_body_count = $(".comment-body.markdown-body.js-comment-body").length;
  const comment_group_count = find_comment_groups($).length;
  A.eq(records.length, rendered_body_count, "Record count must equal rendered comment body count.");
  A.eq(records.length, comment_group_count, "Record count must equal gist comment group count.");
  A.gte(stats.replaced_data_uri_attributes, stats.unique_data_uris, "Unique data URI count cannot exceed replacement count.");
  const seen_comment_ids = new Set<string>();
  for (const [position, record] of records.entries()) {
    A.eq(record.index, position + 1, "Record indices must be sequential and one-based.");
    A(/^\d+$/u.test(record.comment_id), `Record ${record.index} has a non-numeric comment id.`);
    A(!seen_comment_ids.has(record.comment_id), `Duplicate comment id: ${record.comment_id}`);
    A(record.comment_anchor === `gistcomment-${record.comment_id}`, `Record ${record.index} has mismatched anchor/id.`);
    A(record.created_at !== null, `Record ${record.index} is missing created_at.`);
    A(record.author.login !== null, `Record ${record.index} is missing author login.`);
    seen_comment_ids.add(record.comment_id);
  }
}
/**
 * Run the command-line program: read the input HTML, parse its comments, and write JSONL.
 *
 * @param argv - Process arguments after executable and script name.
 * @returns A promise that resolves after parsing and writing JSONL.
 */
async function main(argv: readonly string[]): Promise<void> {
  const cli_args = parse_cli_args(argv);
  const page_html = await read_utf8_file(cli_args.input_path);
  const comments = parse_gist_comments(page_html, basename(cli_args.input_path), cli_args);
  await write_jsonl_file(cli_args.output_path, comments);
  if (cli_args.output_path === null) {
    // No named output file, so there is nothing to report.
    return;
  }
  console.error(`Wrote ${comments.length} comments to ${cli_args.output_path}`);
}
// Start the program; on failure, print the message to stderr and set a non-zero exit code.
main(process.argv.slice(2)).catch((failure: unknown) => {
  console.error(failure instanceof Error ? failure.message : String(failure));
  process.exitCode = 1;
});
TS