feat(import): harden workbook parser boundaries

This commit is contained in:
2026-03-31 22:48:30 +02:00
parent 3e8b1702bc
commit db50e2e555
20 changed files with 936 additions and 174 deletions
+83
View File
@@ -0,0 +1,83 @@
import { describe, expect, it } from "vitest";
import {
MAX_BROWSER_SPREADSHEET_BYTES,
assertSpreadsheetFile,
parseSpreadsheet,
} from "./excel.js";
/**
 * Test fixture: serializes the given rows into a single-sheet workbook and
 * wraps the resulting bytes in a `File` carrying the xlsx MIME type.
 *
 * @param rows - Cell values, one inner array per worksheet row, appended in order.
 * @param fileName - Name given to the produced file.
 * @returns A `File` containing the serialized workbook.
 */
async function createWorkbookFile(
  rows: unknown[][],
  fileName = "spreadsheet.xlsx",
): Promise<File> {
  const excel = await import("exceljs");
  const book = new excel.Workbook();
  const sheet = book.addWorksheet("Sheet1");

  for (let rowIndex = 0; rowIndex < rows.length; rowIndex += 1) {
    sheet.addRow(rows[rowIndex]);
  }

  const xlsxBytes = await book.xlsx.writeBuffer();
  const fileOptions = {
    type: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  };
  return new File([xlsxBytes], fileName, fileOptions);
}
// Suite for the spreadsheet import helpers exported by ./excel.js:
// parseSpreadsheet (CSV and xlsx input) and assertSpreadsheetFile (upload guards).
describe("excel import helpers", () => {
// CSV input with a quoted field containing a comma and an empty line between the data rows.
it("parses csv files with quoted values and skips blank rows", async () => {
const file = new File(
['name,role\n"Alice, A.",Engineer\n\nBob,Producer\n'],
"people.csv",
{ type: "text/csv" },
);
// Expects the quoted comma to stay inside the value and the empty line to yield no record.
await expect(parseSpreadsheet(file)).resolves.toEqual([
{ name: "Alice, A.", role: "Engineer" },
{ name: "Bob", role: "Producer" },
]);
});
// Non-string cells: a Date and a boolean are written to the workbook.
it("parses xlsx files and normalizes date cells to ISO strings", async () => {
const file = await createWorkbookFile([
["name", "startDate", "active"],
["Alice", new Date("2026-03-30T09:15:00.000Z"), true],
]);
// Expects every value as a string: the date as its ISO timestamp, the boolean as "true".
await expect(parseSpreadsheet(file)).resolves.toEqual([
{
name: "Alice",
startDate: "2026-03-30T09:15:00.000Z",
active: "true",
},
]);
});
// The two headers differ only by case; the expected message quotes the second occurrence.
it("rejects duplicate headers in xlsx imports", async () => {
const file = await createWorkbookFile([
["Name", "name"],
["Alice", "Producer"],
]);
await expect(parseSpreadsheet(file)).rejects.toThrow('duplicate header "name"');
});
// Synchronous guard: a file named *.xls must make assertSpreadsheetFile throw.
it("rejects legacy .xls uploads before parsing", () => {
const file = new File(["legacy"], "legacy.xls", {
type: "application/vnd.ms-excel",
});
expect(() => assertSpreadsheetFile(file)).toThrow(
"Legacy .xls files are not supported.",
);
});
// Synchronous guard: the payload is exactly one byte larger than MAX_BROWSER_SPREADSHEET_BYTES.
it("rejects oversized spreadsheet uploads before parsing", () => {
const file = new File([Buffer.alloc(MAX_BROWSER_SPREADSHEET_BYTES + 1)], "oversized.xlsx", {
type: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
});
// toThrow with a string matches a substring, so only the start of the message is pinned here.
expect(() => assertSpreadsheetFile(file)).toThrow(
`The selected file exceeds the ${MAX_BROWSER_SPREADSHEET_BYTES} byte limit`,
);
});
});
+44 -3
View File
@@ -3,6 +3,8 @@ const CSV_EXTENSION = ".csv";
// File extension of legacy Excel workbooks.
// NOTE(review): presumably used to reject such uploads up front — confirm against assertSpreadsheetFile.
const XLS_EXTENSION = ".xls";
/** Spreadsheet size limit in bytes (10 MiB). */
export const MAX_BROWSER_SPREADSHEET_BYTES = 10 * 1024 * 1024;
/**
 * Row limit for imported tables. `assertTabularMatrixWithinLimits` accepts
 * one row more than this value before throwing.
 */
export const MAX_BROWSER_SPREADSHEET_ROWS = 5000;
/** Maximum number of cells allowed in any single row of an imported table. */
export const MAX_BROWSER_SPREADSHEET_COLUMNS = 200;
/** Type of the `exceljs` module namespace. */
type ExcelJsModule = typeof import("exceljs");
// Module-level slot for the exceljs module, initially null.
// NOTE(review): presumably a lazy-load cache — confirm against the loader that assigns it.
let _excelJs: ExcelJsModule | null = null;
@@ -117,8 +119,47 @@ function parseCsvMatrix(input: string): string[][] {
return rows;
}
function matrixToObjects(rows: string[][]): Record<string, string>[] {
/**
 * Rejects tabular data that is too large to import.
 *
 * The matrix may hold at most `MAX_BROWSER_SPREADSHEET_ROWS + 1` rows, and no
 * row may hold more than `MAX_BROWSER_SPREADSHEET_COLUMNS` cells.
 *
 * @param rows - Cell texts, one inner array per row.
 * @param contextLabel - Name of the import, inserted into the error message.
 * @throws Error when the row count or the widest row exceeds its limit.
 */
export function assertTabularMatrixWithinLimits(rows: string[][], contextLabel: string): void {
  const allowedRowCount = MAX_BROWSER_SPREADSHEET_ROWS + 1;
  if (rows.length > allowedRowCount) {
    const rowMessage = `The selected file exceeds the ${MAX_BROWSER_SPREADSHEET_ROWS} row limit for ${contextLabel}.`;
    throw new Error(rowMessage);
  }

  // Track the largest cell count seen across all rows.
  let widestRow = 0;
  rows.forEach((cells) => {
    widestRow = Math.max(widestRow, cells.length);
  });

  if (widestRow > MAX_BROWSER_SPREADSHEET_COLUMNS) {
    const columnMessage = `The selected file exceeds the ${MAX_BROWSER_SPREADSHEET_COLUMNS} column limit for ${contextLabel}.`;
    throw new Error(columnMessage);
  }
}
/**
 * Validates the header row of an imported table.
 *
 * Headers are compared after trimming surrounding whitespace, and duplicates
 * are detected case-insensitively, so `"Name"` and `" name "` count as the
 * same column. An empty header list is accepted without error.
 *
 * @param headers - Header cell texts in column order.
 * @param contextLabel - Name of the import, inserted into the error message.
 * @throws Error when a header is empty or whitespace-only (the message
 *   reports its 1-based column), or when a header repeats an earlier one.
 */
export function assertHeaderRow(headers: string[], contextLabel: string): void {
  if (headers.length === 0) {
    return;
  }

  // Trim here as well so the validation does not silently depend on every
  // caller having trimmed the cells beforehand.
  const trimmedHeaders = headers.map((header) => header.trim());

  const blankHeaderIndex = trimmedHeaders.findIndex((header) => header.length === 0);
  if (blankHeaderIndex >= 0) {
    throw new Error(
      `The selected file contains an empty header cell in column ${blankHeaderIndex + 1} and cannot be used for ${contextLabel}.`,
    );
  }

  // Lower-cased headers seen so far; a repeat means two columns would collide.
  const seen = new Set<string>();
  for (const header of trimmedHeaders) {
    const normalized = header.toLowerCase();
    if (seen.has(normalized)) {
      throw new Error(
        `The selected file contains duplicate header "${header}" and cannot be used for ${contextLabel}.`,
      );
    }
    seen.add(normalized);
  }
}
function matrixToObjects(rows: string[][], contextLabel: string): Record<string, string>[] {
assertTabularMatrixWithinLimits(rows, contextLabel);
const headers = (rows[0] ?? []).map((header) => header.trim());
assertHeaderRow(headers, contextLabel);
if (headers.length === 0) {
return [];
}
@@ -203,7 +244,7 @@ async function parseXlsxSpreadsheet(file: File): Promise<Record<string, string>[
rows.push(cells);
}
return matrixToObjects(rows);
return matrixToObjects(rows, "spreadsheet import");
}
/**
@@ -214,7 +255,7 @@ export async function parseSpreadsheet(file: File): Promise<Record<string, strin
assertSpreadsheetFile(file);
if (getFileExtension(file.name) === CSV_EXTENSION) {
return matrixToObjects(parseCsvMatrix(await file.text()));
return matrixToObjects(parseCsvMatrix(await file.text()), "spreadsheet import");
}
return parseXlsxSpreadsheet(file);
+106
View File
@@ -0,0 +1,106 @@
import { describe, expect, it } from "vitest";
import { matchRoleName, parseSkillMatrixWorkbook } from "./skillMatrixParser.js";
/**
 * Test fixture: builds a workbook with one worksheet per entry in `sheets`
 * and returns its serialized bytes as a standalone `ArrayBuffer`.
 *
 * @param sheets - Worksheet names with their rows, added in the given order.
 * @returns A copy of exactly the bytes that make up the serialized workbook.
 */
async function createWorkbookBuffer(
  sheets: Array<{ name: string; rows: unknown[][] }>,
): Promise<ArrayBuffer> {
  const excel = await import("exceljs");
  const book = new excel.Workbook();

  for (const { name, rows } of sheets) {
    const target = book.addWorksheet(name);
    for (const cells of rows) {
      target.addRow(cells);
    }
  }

  const written = await book.xlsx.writeBuffer();
  const view = written instanceof Uint8Array ? written : new Uint8Array(written);

  // Slice so the result covers only the view's own range of the backing buffer.
  const start = view.byteOffset;
  const end = start + view.byteLength;
  return view.buffer.slice(start, end);
}
// Suite for ./skillMatrixParser.js: parseSkillMatrixWorkbook (workbook -> employee info + skills)
// and matchRoleName (role lookup).
describe("skill matrix parser", () => {
// Workbook with an employee sheet and two skill sheets; "Nuke" appears in both skill sheets.
it("extracts employee info and merges skills by highest proficiency", async () => {
const workbook = await createWorkbookBuffer([
{
name: "Employee Information",
rows: [
["item", "property"],
["Full Name", "Alex Artist"],
["Area of Expertise", "Compositing"],
["Years of Experience", "7.4"],
["Portfolio URL", "https://portfolio.example/alex"],
],
},
{
name: "Software Skills",
rows: [
["category", "item", "property", "main skillset"],
["Software", "Nuke", "2", "1"],
["Software", "Photoshop", "0", ""],
],
},
{
name: "Technical Skillset",
rows: [
["category", "item", "property", "main skillset"],
["Pipeline", "Nuke", "4", ""],
["Pipeline", "Python", "3", "2"],
],
},
]);
// Expected: "7.4" years becomes the integer 7, and the asserted proficiencies are one
// higher than the sheet values (4 -> 5, 3 -> 4).
// NOTE(review): presumably the parser maps a 0-4 sheet scale onto 1-5 — confirm in the parser.
await expect(parseSkillMatrixWorkbook(workbook)).resolves.toEqual({
employeeInfo: {
displayName: "Alex Artist",
areaOfExpertise: "Compositing",
yearsOfExperience: 7,
portfolioUrl: "https://portfolio.example/alex",
},
// arrayContaining only requires these entries to be present; it does not assert that
// other entries (e.g. "Photoshop" with property "0") are absent.
skills: expect.arrayContaining([
{
// Matches the "Technical Skillset" row for Nuke (category "Pipeline", higher value).
skill: "Nuke",
category: "Pipeline",
proficiency: 5,
},
{
skill: "Python",
category: "Pipeline",
proficiency: 4,
isMainSkill: true,
},
]),
});
});
// The "Software Skills" header row repeats "item"; parsing must reject with a message naming it.
it("rejects duplicate headers in skill sheets", async () => {
const workbook = await createWorkbookBuffer([
{
name: "Employee Information",
rows: [
["item", "property"],
["Full Name", "Alex Artist"],
],
},
{
name: "Software Skills",
rows: [
["item", "item", "property"],
["Nuke", "Duplicate", "2"],
],
},
{
name: "Technical Skillset",
rows: [["category", "item", "property"]],
},
]);
await expect(parseSkillMatrixWorkbook(workbook)).rejects.toThrow('duplicate header "item"');
});
// Covers three outcomes: identical name, candidate contained in the input, and no match (null).
it("matches role names by exact and partial matches", () => {
expect(matchRoleName("Compositing", ["Producer", "Compositing"])).toBe("Compositing");
expect(matchRoleName("Senior Producer", ["Producer", "Lighting"])).toBe("Producer");
expect(matchRoleName("Rigging", ["Producer", "Lighting"])).toBeNull();
});
});
+3
View File
@@ -1,4 +1,5 @@
import type { SkillEntry } from "@capakraken/shared";
import { assertHeaderRow, assertTabularMatrixWithinLimits } from "./excel.js";
type ExcelJsModule = typeof import("exceljs");
@@ -80,7 +81,9 @@ function worksheetToRowObjects(
rows.push(cells);
}
assertTabularMatrixWithinLimits(rows, "skill matrix import");
const headers = (rows[0] ?? []).map((header) => header.trim());
assertHeaderRow(headers, "skill matrix import");
if (headers.length === 0) {
return [];
}