feat(import): harden workbook parser boundaries
This commit is contained in:
@@ -1,32 +1,41 @@
|
||||
import { stat } from "node:fs/promises";
|
||||
import { createRequire } from "node:module";
|
||||
import path from "node:path";
|
||||
|
||||
/** One worksheet cell as exposed by this module: a primitive, a `Date`, or `null`. */
export type WorksheetCellValue = boolean | Date | number | string | null;
|
||||
/** A worksheet as an array of rows, each row being an array of cell values. */
export type WorksheetMatrix = WorksheetCellValue[][];
|
||||
|
||||
/** Minimal structural view of a workbook returned by the `xlsx` runtime: worksheets keyed by name. */
type XlsxWorkbook = {
  Sheets: Record<string, unknown>;
};

/** Type of the `exceljs` module namespace. */
type ExcelJsModule = typeof import("exceljs");

/** Instance type created by `new ExcelJS.Workbook()`. */
type ExcelJsWorkbook = InstanceType<ExcelJsModule["Workbook"]>;

/**
 * The workbook's `xlsx` reader, widened with the `_processTableEntry` hook that
 * `patchExcelJsTableCompatibility` wraps.
 *
 * NOTE(review): the hook is declared optional here, presumably because it is an
 * ExcelJS internal that is absent from the published typings — confirm against
 * the installed ExcelJS version.
 */
type ExcelJsXlsxReader = ExcelJsWorkbook["xlsx"] & {
  _processTableEntry?: (
    stream: unknown,
    model: Record<string, unknown>,
    name: string,
  ) => Promise<unknown>;
};
|
||||
|
||||
// The exact option set this file passes to `XLSX.utils.sheet_to_json`; the
// literal types pin each option to the single value used.
type SheetToJsonOptions = {
|
||||
header: 1;
|
||||
raw: true;
|
||||
defval: null;
|
||||
};
|
||||
|
||||
// Hand-written structural type for the slice of the `xlsx` package this file
// uses: `readFile` with fixed options and `utils.sheet_to_json`.
type XlsxRuntime = {
|
||||
readFile(filePath: string, options: { cellDates: true; dense: true }): XlsxWorkbook;
|
||||
utils: {
|
||||
sheet_to_json<T>(worksheet: unknown, options: SheetToJsonOptions): T[];
|
||||
};
|
||||
};
|
||||
|
||||
// Build a CommonJS-style `require` bound to this ES module's URL.
const require = createRequire(import.meta.url);
|
||||
// Load the `xlsx` package through that `require` and view it as `XlsxRuntime`.
const XLSX = require("xlsx") as XlsxRuntime;
|
||||
|
||||
// File extension for workbooks. NOTE(review): presumably checked in
// `validateWorkbookPath`, whose body is not visible here — confirm.
const DISPO_WORKBOOK_EXTENSION = ".xlsx";
|
||||
// 15 MiB. NOTE(review): presumably the file-size ceiling applied during path
// validation — the enforcing code is not visible here; confirm.
export const MAX_DISPO_WORKBOOK_BYTES = 15 * 1024 * 1024;
|
||||
// Maximum number of rows `assertWorksheetShape` accepts for one worksheet.
export const MAX_DISPO_WORKBOOK_ROWS = 10000;
|
||||
// Maximum width of any single row `assertWorksheetShape` accepts.
export const MAX_DISPO_WORKBOOK_COLUMNS = 1024;
|
||||
|
||||
// Passed as the `ignoreNodes` option to `workbook.xlsx.readFile`.
const EXCELJS_IGNORE_WORKSHEET_NODES = ["tableParts"];
|
||||
// Substring looked for in an error message by `isUnsupportedExcelJsTableFilterError`.
const EXCELJS_UNSUPPORTED_TABLE_FILTER_MARKER = '"name":"dateGroupItem"';
|
||||
|
||||
// Memoized `exceljs` module; filled on the first `getExcelJS()` call.
let _excelJs: ExcelJsModule | null = null;
|
||||
// In-flight or completed worksheet loads, keyed by `${resolvedPath}::${sheetName}`
// in `readWorksheetMatrix`. Entries are removed there when the load rejects.
const worksheetMatrixCache = new Map<string, Promise<WorksheetMatrix>>();
|
||||
|
||||
/**
 * Unwraps the result of `import("exceljs")` into the module namespace that
 * exposes `Workbook`.
 *
 * @param module - Either the namespace itself or a wrapper carrying it under `default`.
 * @returns The object that exposes `Workbook`.
 * @throws Error when neither the module nor its `default` export exposes
 *   `Workbook`, instead of handing back `undefined` typed as a module.
 */
function normalizeExcelJsModule(module: ExcelJsModule | { default?: ExcelJsModule }): ExcelJsModule {
  if ("Workbook" in module) {
    return module;
  }

  const fallback = module.default;
  if (!fallback?.Workbook) {
    throw new Error('Unable to load "exceljs": the imported module does not expose a Workbook class.');
  }

  return fallback;
}
|
||||
|
||||
/**
 * Lazily imports `exceljs` and memoizes the normalized module in `_excelJs`,
 * so the dynamic import runs only until the first successful load.
 */
async function getExcelJS() {
  if (_excelJs) {
    return _excelJs;
  }

  const imported = await import("exceljs");
  _excelJs = normalizeExcelJsModule(imported);
  return _excelJs;
}
|
||||
|
||||
function trimTrailingNulls(row: WorksheetCellValue[]): WorksheetCellValue[] {
|
||||
let end = row.length;
|
||||
@@ -44,6 +53,10 @@ function trimTrailingEmptyRows(rows: WorksheetMatrix): WorksheetMatrix {
|
||||
return rows.slice(0, end);
|
||||
}
|
||||
|
||||
/**
 * Returns an independent copy of a worksheet matrix.
 *
 * Rows are copied, and `Date` cells (the only mutable cell type) are
 * duplicated, so mutating the result can never alter the source matrix.
 * This matters because the source is a cached load shared between callers.
 *
 * @param rows - Matrix to copy; it is not modified.
 * @returns A new matrix with new row arrays and new `Date` instances.
 */
function cloneWorksheetMatrix(rows: WorksheetMatrix): WorksheetMatrix {
  return rows.map((row) =>
    row.map((cell) => (cell instanceof Date ? new Date(cell.getTime()) : cell)),
  );
}
|
||||
|
||||
async function validateWorkbookPath(workbookPath: string): Promise<string> {
|
||||
const resolvedPath = path.resolve(workbookPath);
|
||||
|
||||
@@ -119,31 +132,99 @@ function normalizeWorksheetCellValue(value: unknown): WorksheetCellValue {
|
||||
return String(value);
|
||||
}
|
||||
|
||||
/**
 * Enforces the import size limits on an already materialized worksheet.
 *
 * @param rows - Worksheet contents to check.
 * @param sheetName - Worksheet name, used only in the error message.
 * @param workbookPath - Workbook path, used only in the error message.
 * @throws Error when there are more than `MAX_DISPO_WORKBOOK_ROWS` rows, or
 *   when any row is wider than `MAX_DISPO_WORKBOOK_COLUMNS`. The row limit is
 *   checked first.
 */
function assertWorksheetShape(rows: WorksheetMatrix, sheetName: string, workbookPath: string): void {
  const tooManyRows = rows.length > MAX_DISPO_WORKBOOK_ROWS;
  if (tooManyRows) {
    throw new Error(
      `Worksheet "${sheetName}" in "${workbookPath}" exceeds the ${MAX_DISPO_WORKBOOK_ROWS} row import limit.`,
    );
  }

  // Equivalent to comparing the widest row against the limit.
  const tooManyColumns = rows.some((row) => row.length > MAX_DISPO_WORKBOOK_COLUMNS);
  if (tooManyColumns) {
    throw new Error(
      `Worksheet "${sheetName}" in "${workbookPath}" exceeds the ${MAX_DISPO_WORKBOOK_COLUMNS} column import limit.`,
    );
  }
}
|
||||
|
||||
/**
 * Tells whether `error` is an `Error` whose message contains
 * `EXCELJS_UNSUPPORTED_TABLE_FILTER_MARKER`.
 */
function isUnsupportedExcelJsTableFilterError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }

  return error.message.includes(EXCELJS_UNSUPPORTED_TABLE_FILTER_MARKER);
}
|
||||
|
||||
/**
 * Wraps the workbook reader's `_processTableEntry` so that a failure recognized
 * by `isUnsupportedExcelJsTableFilterError` resolves to `undefined` instead of
 * rejecting. Any other error is rethrown unchanged.
 *
 * Does nothing when the reader has no `_processTableEntry` function.
 *
 * @param workbook - Workbook whose `xlsx` reader is patched in place.
 */
function patchExcelJsTableCompatibility(workbook: ExcelJsWorkbook): void {
  const xlsxReader = workbook.xlsx as ExcelJsXlsxReader;
  const upstreamProcessTableEntry = xlsxReader._processTableEntry;

  if (typeof upstreamProcessTableEntry === "function") {
    xlsxReader._processTableEntry = async function processTableEntryWithCompatibilityFallback(
      stream,
      model,
      name,
    ) {
      try {
        // Forward `this` so the original hook still runs against the reader.
        return await upstreamProcessTableEntry.call(this, stream, model, name);
      } catch (error) {
        if (!isUnsupportedExcelJsTableFilterError(error)) {
          throw error;
        }

        return undefined;
      }
    };
  }
}
|
||||
|
||||
/**
 * Reads one worksheet of a workbook into a matrix of normalized cell values.
 *
 * Loads are cached per `${resolvedPath}::${sheetName}`. The cache holds the
 * load promise, so concurrent callers share one parse. Every caller receives
 * its own copy via `cloneWorksheetMatrix`. A failed load is evicted so a later
 * call can retry.
 *
 * NOTE(review): cached entries are never invalidated when the file changes on
 * disk — confirm that this is acceptable for the callers.
 *
 * @param workbookPath - Path of the workbook; passed through `validateWorkbookPath`.
 * @param sheetName - Name of the worksheet to read.
 * @returns Rows with trailing `null` cells trimmed and trailing empty rows removed.
 * @throws Error when the worksheet is missing, when `assertWorksheetShape`
 *   rejects the worksheet's size, or when validation or parsing fails.
 */
export async function readWorksheetMatrix(
  workbookPath: string,
  sheetName: string,
): Promise<WorksheetMatrix> {
  const resolvedPath = await validateWorkbookPath(workbookPath);
  const cacheKey = `${resolvedPath}::${sheetName}`;

  const cachedMatrix = worksheetMatrixCache.get(cacheKey);
  if (cachedMatrix) {
    return cloneWorksheetMatrix(await cachedMatrix);
  }

  const matrixPromise = (async () => {
    const ExcelJS = await getExcelJS();
    const workbook = new ExcelJS.Workbook();
    // Must be applied before reading so unsupported table filters are tolerated.
    patchExcelJsTableCompatibility(workbook);
    await workbook.xlsx.readFile(resolvedPath, { ignoreNodes: EXCELJS_IGNORE_WORKSHEET_NODES });

    const worksheet = workbook.getWorksheet(sheetName);
    if (!worksheet) {
      throw new Error(`Worksheet "${sheetName}" not found in workbook "${resolvedPath}"`);
    }

    const rows: WorksheetMatrix = [];
    // ExcelJS rows and columns are 1-based.
    for (let rowNumber = 1; rowNumber <= worksheet.rowCount; rowNumber += 1) {
      const row = worksheet.getRow(rowNumber);
      const cells: WorksheetCellValue[] = [];

      for (let columnNumber = 1; columnNumber <= row.cellCount; columnNumber += 1) {
        cells.push(normalizeWorksheetCellValue(row.getCell(columnNumber).value));
      }

      rows.push(trimTrailingNulls(cells));
    }

    const normalizedRows = trimTrailingEmptyRows(rows);

    // Limits apply to the trimmed shape, so trailing blank rows or cells do not count.
    assertWorksheetShape(normalizedRows, sheetName, resolvedPath);

    return normalizedRows;
  })();

  // Register before awaiting so concurrent callers reuse this load.
  worksheetMatrixCache.set(cacheKey, matrixPromise);

  try {
    return cloneWorksheetMatrix(await matrixPromise);
  } catch (error) {
    // Do not keep a rejected promise in the cache.
    worksheetMatrixCache.delete(cacheKey);
    throw error;
  }
}
|
||||
|
||||
export function getCellString(
|
||||
|
||||
Reference in New Issue
Block a user