| 148 | 150 | | yield descriptor; |
| 149 | 151 | | } |
| 150 | 152 | | } |
| 153 | + | |
/**
 * Stable JSON serialization. Keys are sorted recursively so semantically
 * equal objects produce identical strings regardless of property
 * insertion order. `undefined` values are omitted (treated as "not set"),
 * matching how the rest of the export pipeline interprets missing
 * descriptor fields.
 *
 * Not a general-purpose stable-stringify — no cycle detection, no Date /
 * Map / Set / RegExp special-casing. Descriptors are plain JSON-shaped
 * objects (the documented contract), so the simple recursion is safe.
 * If a user ever passes a `Headers` instance, it'll serialize as `{}`
 * and dedup that case incorrectly — document plain-object headers as
 * the contract; don't auto-detect, because guessing is worse than a
 * predictable miss.
 *
 * @param {*} value - JSON-shaped value to serialize.
 * @returns {string|undefined} Canonical JSON string, or `undefined` for
 *   `undefined` input (so object properties holding it are omitted,
 *   mirroring `JSON.stringify` semantics).
 */
function stableStringify(value) {
  if (value === undefined) return undefined;
  if (value === null) return "null";
  if (typeof value !== "object") return JSON.stringify(value);
  if (Array.isArray(value)) {
    // Array.from (not .map) so holes in sparse arrays are visited and
    // serialized as "null", matching JSON.stringify — which treats
    // [1,,2] and [1,null,2] identically. `.map` skips holes and would
    // emit the invalid, non-canonical string "[1,,2]", giving
    // semantically equal arrays different dedup keys.
    const items = Array.from(value, (v) => stableStringify(v) ?? "null");
    return "[" + items.join(",") + "]";
  }
  const parts = [];
  for (const k of Object.keys(value).sort()) {
    const sv = stableStringify(value[k]);
    // Omit unset fields, exactly like JSON.stringify does for undefined.
    if (sv === undefined) continue;
    parts.push(JSON.stringify(k) + ":" + sv);
  }
  return "{" + parts.join(",") + "}";
}
| 184 | + | |
/**
 * Builds the dedup key for an export descriptor: the *whole descriptor*,
 * stably serialized. Two descriptors count as duplicates only when they
 * match structurally in every field — path, filename, headers,
 * prerender/rsc/outlet/remote/origin/host, all of it.
 *
 * Rationale for exact-match rather than e.g. keying on `filename` alone:
 *
 * - Filename collisions between distinct descriptors are legitimate:
 *   two descriptors can target the same output path with different
 *   `prerender` settings and therefore produce different *sidecar*
 *   artifacts (postpone state, prerender cache). A filename-only key
 *   would silently discard one of those renders.
 * - Headers influence the rendered HTML (content negotiation, locale).
 *   Descriptors that differ only in `accept` headers must both render,
 *   even when path + filename coincide — picking distinct filenames for
 *   both on-disk artifacts is the user's job.
 *
 * The conservative rule — skip work only when every input is identical —
 * guarantees dedup never changes output. What it actually catches is the
 * common accident: a generator yielding the same descriptor twice
 * (overlapping CMS pages, a doubly-walked manifest, and so on).
 *
 * @param {object} item - Export descriptor (plain JSON-shaped object).
 * @returns {string|undefined} Canonical serialization used as the key.
 */
function dedupeKey(item) {
  const canonical = stableStringify(item);
  return canonical;
}
| 211 | + | |
/**
 * Streaming dedup over an async iterable: items whose `dedupeKey` has
 * already been seen are dropped.
 *
 * Memory model: a `Set<string>` of 128-bit SHAKE256 digests of the dedup
 * key, latin1-encoded for compactness (16 bytes per key, no encoding
 * expansion). Per-key cost is bounded regardless of path length; the
 * collision probability is roughly 10⁻²⁰ at 10M entries — below hardware
 * bit-flip rates, so indistinguishable from exact dedup in practice.
 *
 * Soft cap: once `limit` unique entries are recorded we stop deduping
 * and warn instead of dropping anything. Correctness first — the worst
 * outcome is "a duplicate write is emitted" (the historic behavior),
 * never "a unique page is silently skipped." Hitting the cap usually
 * means the source has a bug or has outgrown single-build static export;
 * the warning starts that conversation instead of failing silently.
 *
 * @param {AsyncIterable<object>} stream - Descriptors to dedup.
 * @param {object} [options]
 * @param {number} [options.limit=1_000_000] - Unique-key soft cap.
 * @param {(item: object) => void} [options.onDuplicate] - Per dropped item.
 * @param {(limit: number) => void} [options.onCapExceeded] - Fired once.
 * @yields {object} Each first-seen descriptor, in arrival order.
 */
export async function* dedupedPathStream(
  stream,
  { limit = 1_000_000, onDuplicate, onCapExceeded } = {}
) {
  // SHAKE256 at 16 output bytes = 128-bit hash, native node:crypto, no
  // dependency. latin1 keeps the Set key at 16 chars — hex would double
  // it, and base64 is 22 chars plus slower encoding.
  const fingerprint = (item) =>
    createHash("shake256", { outputLength: 16 })
      .update(dedupeKey(item))
      .digest("latin1");

  const emitted = new Set();
  let warnedAboutCap = false;

  for await (const item of stream) {
    const key = fingerprint(item);

    if (emitted.has(key)) {
      onDuplicate?.(item);
      continue;
    }

    if (emitted.size < limit) {
      emitted.add(key);
    } else {
      if (!warnedAboutCap) {
        warnedAboutCap = true;
        onCapExceeded?.(limit);
      }
      // Over the cap we yield without recording: duplicates of keys we
      // never stored will leak through, but no unique item is dropped.
    }
    yield item;
  }
}