The Atlas BigLaw / Big Michael — documentation bound to its code
7 documents

Lawyer voice fingerprinting

Turn a lawyer's LinkedIn writing history into a reusable ToneProfile via a MapReduce of Haiku calls, then watch that voice get injected into every drafting agent and the final synthesis.

src/services/toneAnalyzer.ts · 217 lines · analyzeTone L193–216
Outline 12 symbols
1// SPDX-License-Identifier: AGPL-3.0-only
2// Copyright (C) 2026 Discover Legal
3
4import Anthropic from "@anthropic-ai/sdk";
5import { Config } from "../config.js";
6import { logger } from "../logger.js";
7import { costStore, calcCostUsd } from "../cost/index.js";
8import { sanitizePromptContent } from "../adapters/lavern.js";
9import type { ToneProfile } from "../types.js";
10
// Single Anthropic SDK client shared by every call in this module; the API
// key is read from Config.
const client = new Anthropic({
  apiKey: Config.anthropic.apiKey,
});
12
// Number of raw posts handed to each leaf-level analysis call: few enough to
// fit comfortably in one Haiku request, enough to expose recurring style
// patterns.
const POST_CHUNK_SIZE = 8;

// Number of style notes merged per rollup call. Notes run longer than raw
// posts, so the fan-in is kept smaller to keep each rollup request tight.
const NOTE_CHUNK_SIZE = 6;

// Ceiling on how many posts are analysed in one run. Chunking copes with
// scale, so the limit is deliberately generous rather than a low hard cap.
const MAX_POSTS = 500;
23
24// ─── Sanitization ─────────────────────────────────────────────────────────────
25
/**
 * Clean user-supplied text before it is embedded in any model prompt.
 *
 * LinkedIn posts are fully attacker-controlled and ultimately shape the
 * injectionSnippet that is prepended to every drafting agent's system prompt,
 * so a crafted post must not be able to smuggle in fake
 * FINDING/CHALLENGE/RESOLUTION blocks or override prompt instructions.
 * The actual stripping of structural markers and control characters is
 * delegated to the canonical sanitizer so the marker list lives in one place.
 *
 * @param s - Untrusted text to clean.
 * @returns Whatever sanitizePromptContent returns for that text.
 */
function sanitizeForHaiku(s: string): string {
  const cleaned = sanitizePromptContent(s);
  return cleaned;
}
37
38// ─── Utilities ────────────────────────────────────────────────────────────────
39
/**
 * Split an array into consecutive chunks of at most `size` elements.
 *
 * Element order is preserved; the last chunk may be shorter than `size`.
 * An empty input yields an empty result. The input array is not mutated.
 *
 * @param arr - Items to partition.
 * @param size - Maximum chunk length; must be a positive integer.
 * @returns The chunks, in input order.
 * @throws RangeError when `size` is not a positive integer. A zero or
 *   negative step would never advance the loop index, so the loop would spin
 *   forever while growing the output.
 */
function chunkArray<T>(arr: readonly T[], size: number): T[][] {
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunkArray: size must be a positive integer, got ${size}`);
  }
  const out: T[][] = [];
  for (let i = 0; i < arr.length; i += size) out.push(arr.slice(i, i + size));
  return out;
}
45
/**
 * Send a single user message to Claude Haiku and return the reply text.
 *
 * Every call is recorded in costStore with its token counts, computed USD
 * cost and wall-clock duration, under the "tone_analysis" context.
 *
 * @param content - Full prompt, sent as the only user message.
 * @param maxTokens - Output cap forwarded as `max_tokens`.
 * @param profileId - Optional profile the cost record is attributed to.
 * @returns The trimmed text of the first text block, or "" when the response
 *   contains no text block.
 */
async function haiku(content: string, maxTokens: number, profileId?: string): Promise<string> {
  const model = "claude-haiku-4-5-20251001";
  const startedAt = Date.now();

  const response = await client.messages.create({
    model,
    max_tokens: maxTokens,
    messages: [{ role: "user", content }],
  });

  const { input_tokens: inputTokens, output_tokens: outputTokens } = response.usage;
  costStore.record({
    model,
    provider: "anthropic",
    inputTokens,
    outputTokens,
    costUsd: calcCostUsd(model, inputTokens, outputTokens),
    estimatedWh: null,
    estimatedWatts: null,
    durationMs: Date.now() - startedAt,
    context: "tone_analysis",
    profileId,
  });

  // Scan for the first text block instead of trusting content[0]: a leading
  // non-text block or an empty content array must not throw.
  for (const block of response.content) {
    if (block.type === "text") return block.text.trim();
  }
  return "";
}
70
71// ─── Leaf: analyse a single chunk of raw posts ────────────────────────────────
72
/**
 * Leaf step: ask Haiku for a prose style note covering one chunk of raw posts.
 *
 * Posts are numbered from 1 and separated by `---POST n---` headers.
 *
 * @param posts - Raw posts for this chunk; the caller is expected to have sanitized them.
 * @param safeName - Lawyer name to embed in the prompt; the caller is expected to have sanitized it.
 * @param profileId - Optional profile for cost attribution.
 * @returns The model's style paragraph for this chunk.
 */
async function analyzeChunk(posts: string[], safeName: string, profileId?: string): Promise<string> {
  const numberedPosts = posts.map((post, index) => `---POST ${index + 1}---\n${post}`);
  const prompt = [
    `Analyse the writing style of ${safeName} from these ${posts.length} posts. `,
    `Write a single dense paragraph (3–5 sentences) capturing: formality level, sentence structure, vocabulary register, rhetorical habits, and any distinctive phrases or transitions. `,
    `Be specific — quote actual words or phrases observed. `,
    `Plain prose only. No JSON, no headers, no bullet points.\n\n${numberedPosts.join("\n\n")}`,
  ].join("");

  return haiku(prompt, 300, profileId);
}
84
85// ─── Internal node: merge a batch of style notes ─────────────────────────────
86
/**
 * Internal-node step: merge a batch of style notes into one paragraph.
 *
 * Notes are numbered from 1 and labelled `[Observation n]` in the prompt.
 *
 * @param notes - Style notes produced by a lower level of the rollup.
 * @param safeName - Lawyer name to embed in the prompt; the caller is expected to have sanitized it.
 * @param profileId - Optional profile for cost attribution.
 * @returns The model's synthesised style paragraph.
 */
async function rollupNotes(notes: string[], safeName: string, profileId?: string): Promise<string> {
  const labelledNotes = notes.map((note, index) => `[Observation ${index + 1}]\n${note}`);
  const prompt = [
    `Synthesise these ${notes.length} writing style observations for ${safeName} into one coherent paragraph. `,
    `Preserve specific phrases and concrete patterns. Where observations conflict, note the variation briefly. `,
    `Plain prose only. No JSON, no headers, no bullet points.\n\n${labelledNotes.join("\n\n")}`,
  ].join("");

  return haiku(prompt, 300, profileId);
}
97
98// ─── Root: convert final prose note → structured ToneProfile ─────────────────
99
/**
 * Root step: convert the final prose style note into a structured ToneProfile.
 *
 * Asks Haiku for a JSON object, parses it, and validates every field:
 * enum-like fields fall back to a neutral default when the model returns an
 * unknown value, signaturePatterns is capped at 5 entries of 200 characters,
 * and injectionSnippet is capped at 1000 characters with a generic fallback
 * when it is missing or blank.
 *
 * The free-text fields are model output derived from attacker-controlled
 * posts, so they are passed through sanitizeForHaiku before being stored.
 * NOTE(review): this assumes the sanitizer is safe to apply to model output
 * as well as to raw posts — confirm against sanitizePromptContent.
 *
 * @param finalNote - The single style paragraph produced by the rollup.
 * @param safeName - Lawyer name to embed in the prompt; the caller is expected to have sanitized it.
 * @param meta - Sample count and source type copied into the profile, plus
 *   an optional profileId used for cost attribution.
 * @returns The validated ToneProfile.
 * @throws Error("buildProfile returned invalid JSON") when the reply contains
 *   no JSON object or the object cannot be parsed.
 */
async function buildProfile(
  finalNote: string,
  safeName: string,
  meta: { sampleCount: number; sourceType: ToneProfile["sourceType"]; profileId?: string },
): Promise<ToneProfile> {
  const raw = await haiku(
    `Convert this writing style description for ${safeName} into structured JSON. ` +
      `Respond with ONLY valid JSON — no prose, no markdown fences.\n\n` +
      `Shape:\n` +
      `{\n` +
      ` "formality": "formal" | "semi-formal" | "conversational",\n` +
      ` "sentenceStyle": "long-complex" | "mixed" | "short-punchy",\n` +
      ` "vocabulary": "technical-heavy" | "balanced" | "plain-language",\n` +
      ` "rhetoricalStyle": "assertive" | "collaborative" | "hedging" | "analytical",\n` +
      ` "signaturePatterns": ["<specific observation>", ...],\n` +
      ` "injectionSnippet": "<3–5 sentence instruction for an LLM drafter to mirror this voice, starting with the lawyer's first name>"\n` +
      `}\n\n` +
      `signaturePatterns: 2–5 concrete observations quoting actual words or habits observed.\n` +
      `injectionSnippet: must read as a direct instruction, e.g. "${safeName} writes with directness and economy..."\n\n` +
      `Style description:\n${finalNote}`,
    800,
    meta.profileId,
  );

  const p = parseProfileJson(raw);

  // Accept a value only if it is one of the allowed literals; otherwise use the fallback.
  const pick = <T extends string>(val: unknown, allowed: readonly T[], fallback: T): T =>
    (allowed as readonly unknown[]).includes(val) ? (val as T) : fallback;

  // Sanitize before truncating so the length caps apply to what is stored,
  // and drop entries that end up empty.
  const signaturePatterns = Array.isArray(p.signaturePatterns)
    ? (p.signaturePatterns as unknown[])
        .filter((x): x is string => typeof x === "string")
        .map((x) => sanitizeForHaiku(x).trim().slice(0, 200))
        .filter(Boolean)
        .slice(0, 5)
    : [];

  // A blank or whitespace-only snippet is treated the same as a missing one.
  const snippet =
    typeof p.injectionSnippet === "string" ? sanitizeForHaiku(p.injectionSnippet).trim().slice(0, 1000) : "";

  return {
    generatedAt: new Date().toISOString(),
    sourceType: meta.sourceType,
    sampleCount: meta.sampleCount,
    formality: pick(p.formality, ["formal", "semi-formal", "conversational"] as const, "semi-formal"),
    sentenceStyle: pick(p.sentenceStyle, ["long-complex", "mixed", "short-punchy"] as const, "mixed"),
    vocabulary: pick(p.vocabulary, ["technical-heavy", "balanced", "plain-language"] as const, "balanced"),
    rhetoricalStyle: pick(
      p.rhetoricalStyle,
      ["assertive", "collaborative", "hedging", "analytical"] as const,
      "analytical",
    ),
    signaturePatterns,
    injectionSnippet:
      snippet || `${safeName} — no distinctive style detected. Write in clear, professional legal English.`,
  };
}

/**
 * Extract and parse the outermost `{...}` object from a raw model reply.
 *
 * Markdown code fences are removed first. Any failure — no braces, or text
 * between the braces that is not valid JSON — surfaces as the same Error so
 * callers see one failure mode; the parser's own message is logged at debug
 * level.
 */
function parseProfileJson(raw: string): Record<string, unknown> {
  const stripped = raw.replace(/```(?:json)?/gi, "").trim();
  const start = stripped.indexOf("{");
  const end = stripped.lastIndexOf("}");
  if (start === -1 || end === -1 || end <= start) throw new Error("buildProfile returned invalid JSON");

  try {
    return JSON.parse(stripped.slice(start, end + 1)) as Record<string, unknown>;
  } catch (err: unknown) {
    logger.debug("Tone profile JSON parse failed", {
      error: err instanceof Error ? err.message : String(err),
    });
    throw new Error("buildProfile returned invalid JSON");
  }
}
150
151// ─── Recursive rollup ─────────────────────────────────────────────────────────
152
/**
 * Reduce a list of items to a single style note, one tree level per call.
 *
 * At each level the items are chunked (POST_CHUNK_SIZE for raw posts,
 * NOTE_CHUNK_SIZE for notes), every chunk is processed in parallel, and the
 * resulting notes become the input of the next level. Recursion stops when a
 * level produces exactly one note.
 *
 * @param items - Raw posts (when `isRaw`) or style notes from the level below.
 * @param safeName - Lawyer name to embed in prompts; the caller is expected to have sanitized it.
 * @param level - Current tree depth, used for logging only.
 * @param isRaw - True when `items` are raw posts; only the first level is raw.
 * @param profileId - Optional profile for cost attribution.
 * @returns The single merged style note.
 * @throws Error when `items` is empty. Zero items produce zero notes, which
 *   would otherwise recurse forever without ever reaching a single note.
 */
async function recursiveRollup(
  items: string[],
  safeName: string,
  level: number,
  isRaw: boolean,
  profileId?: string,
): Promise<string> {
  if (items.length === 0) {
    throw new Error("recursiveRollup requires at least one item");
  }

  const chunkSize = isRaw ? POST_CHUNK_SIZE : NOTE_CHUNK_SIZE;
  const chunks = chunkArray(items, chunkSize);

  logger.debug("Tone rollup", { level, chunks: chunks.length, items: items.length, isRaw });

  // Process all chunks at this level in parallel
  const notes = await Promise.all(
    chunks.map((c) => (isRaw ? analyzeChunk(c, safeName, profileId) : rollupNotes(c, safeName, profileId))),
  );

  // Single note — recursion is complete. The undefined check keeps the
  // access sound without a non-null assertion; an empty-string note is still
  // a valid result.
  const [onlyNote] = notes;
  if (notes.length === 1 && onlyNote !== undefined) return onlyNote;

  // Multiple notes — recurse (notes are never raw)
  return recursiveRollup(notes, safeName, level + 1, false, profileId);
}
176
177// ─── Public API ───────────────────────────────────────────────────────────────
178
/**
 * Build a ToneProfile from writing samples via a chunked recursive rollup.
 *
 * The lawyer name (trimmed, limited to 200 characters) and every sample
 * (trimmed) are sanitized first; samples that end up empty are dropped and
 * only the first MAX_POSTS remaining samples are used. Leaf chunks of
 * POST_CHUNK_SIZE posts are analysed in parallel by Haiku, the resulting
 * notes are merged in parallel batches of NOTE_CHUNK_SIZE until one note
 * remains, and a final Haiku call turns that note into the structured profile.
 *
 * @param samples - Raw writing samples.
 * @param lawyerName - Display name of the lawyer being profiled.
 * @param sourceType - Origin of the samples, copied into the profile.
 * @param profileId - Optional; when provided, every Haiku call is attributed
 *   to that profile in the cost log.
 * @returns The generated ToneProfile.
 * @throws Error("No writing samples provided") when no non-empty sample remains.
 */
export async function analyzeTone(
  samples: string[],
  lawyerName: string,
  sourceType: ToneProfile["sourceType"],
  profileId?: string,
): Promise<ToneProfile> {
  const trimmedName = lawyerName.trim().slice(0, 200);
  const safeName = sanitizeForHaiku(trimmedName);

  const cleaned = samples.map((sample) => sanitizeForHaiku(sample.trim()));
  const nonEmpty = cleaned.filter((text) => Boolean(text));
  const posts = nonEmpty.slice(0, MAX_POSTS);

  if (posts.length === 0) {
    throw new Error("No writing samples provided");
  }

  logger.info("Tone analysis starting", { lawyer: safeName, posts: posts.length, sourceType });

  const finalNote = await recursiveRollup(posts, safeName, 0, true, profileId);
  const profile = await buildProfile(finalNote, safeName, {
    sampleCount: posts.length,
    sourceType,
    profileId,
  });

  logger.info("Tone analysis complete", {
    lawyer: safeName,
    formality: profile.formality,
    rhetoric: profile.rhetoricalStyle,
  });

  return profile;
}
217