@frasma/extractify

v1.0.0

Published

14 days ago

Functional utilities to extract, transform and flow your data

0 High
0 Medium
0 Low

typescript functional utility extractor pure-functions document-extraction ocr pdf langchain langgraph openai gpt-4o vision-api

Extractify

A TypeScript library for data extraction with functional utilities and AI-powered document processing.

Features

Functional Utilities: Pure functions for array and object manipulation
Document Extraction: AI-powered data extraction from PDFs and images using OpenAI Vision
LangGraph Workflow: Robust extraction pipeline with retry logic and validation
Type Safety: Full TypeScript support with typed extraction results
Flexible Schema: Define extraction schemas for single values and lists

Installation

npm install @frasma/extractify

Document Extraction

Extract structured data from documents (PDFs and images) using OpenAI's Vision API.

Basic Usage

import { extractFromDocument } from "@frasma/extractify";

// Define what to extract
const schema = [
  {
    key: "name",
    name: "Name",
    description: "Full name of the person",
    examples: ["John Doe", "Jane Smith"],
    type: "single",
  },
  {
    key: "email",
    name: "Email",
    description: "Email address",
    examples: ["john.doe@example.com"],
    type: "single",
  },
];

// Extract from document
const result = await extractFromDocument({
  apiKey: process.env.OPENAI_API_KEY,
  document: "./document.pdf", // or Buffer
  schema: schema,
  options: {
    model: "gpt-4o",
    temperature: 0.1,
    includeConfidence: true,
  },
});

if (result.success) {
  console.log("Extracted data:", result.data);
  console.log("Confidence:", result.confidence?.overall);
} else {
  console.error("Extraction failed:", result.error);
}

Extracting Lists (e.g., Invoice Items)

import { extractFromDocument, createListField } from "@frasma/extractify";

// Define schema with list fields
const invoiceSchema = [
  {
    key: "invoiceNumber",
    name: "Invoice Number",
    description: "Invoice identification number",
    examples: ["INV-2024-001"],
    type: "single",
  },
  {
    key: "customer",
    name: "Customer",
    description: "Customer name",
    examples: ["Acme Corp"],
    type: "single",
  },
  {
    key: "items",
    name: "Items",
    description: "List of purchased items",
    examples: [],
    type: "list",
    itemSchema: [
      {
        key: "product",
        name: "Product",
        description: "Product name",
        examples: ["Widget", "Gadget"],
        type: "single",
      },
      {
        key: "quantity",
        name: "Quantity",
        description: "Number of units",
        examples: ["1", "5"],
        type: "single",
      },
      {
        key: "price",
        name: "Price",
        description: "Unit price",
        examples: ["99.99", "149.00"],
        type: "single",
      },
    ],
  },
  {
    key: "total",
    name: "Total",
    description: "Total amount",
    examples: ["$599.99"],
    type: "single",
  },
];

// Type your extraction result
interface InvoiceData {
  invoiceNumber: string;
  customer: string;
  items: Array<{
    product: string;
    quantity: string;
    price: string;
  }>;
  total: string;
}

const result = await extractFromDocument<InvoiceData>({
  apiKey: "sk-...",
  document: invoiceBuffer,
  schema: invoiceSchema,
});

// result.data is typed as InvoiceData

Using Helper Functions

import { createSimpleSchema, createListField } from "@frasma/extractify";

// Quick schema creation for simple fields
const simpleSchema = createSimpleSchema([
  { key: "name", description: "Person name" },
  {
    key: "birthDate",
    description: "Date of birth in DD/MM/YYYY format",
    examples: ["15/03/1990"],
  },
  { key: "address", description: "Full address" },
]);

// Create a list field
const productsField = createListField(
  "products",
  "Products",
  "List of products in the document",
  [
    { key: "name", description: "Product name" },
    { key: "sku", description: "Product SKU code" },
    { key: "price", description: "Product price" },
  ],
);

// Combine them
const fullSchema = [...simpleSchema, productsField];

Reusable Extractor

import { createDocumentExtractor } from "@frasma/extractify";

// Create a reusable extractor with default settings
const extractor = createDocumentExtractor(process.env.OPENAI_API_KEY, {
  model: "gpt-4o",
  includeConfidence: true,
  maxRetries: 3,
});

// Use it multiple times
const result1 = await extractor.extract(document1, schema);
const result2 = await extractor.extract(document2, schema);

Extraction Options

| Option              | Type    | Default    | Description                                  |
| ------------------- | ------- | ---------- | -------------------------------------------- |
| `model`             | string  | `'gpt-4o'` | OpenAI model to use                          |
| `temperature`       | number  | 0.1        | LLM temperature (lower = more deterministic) |
| `maxTokens`         | number  | 4096       | Maximum tokens for response                  |
| `maxRetries`        | number  | 3          | Retry attempts on API errors                 |
| `includeConfidence` | boolean | true       | Include confidence scores                    |
| `imageQuality`      | number  | 85         | JPEG quality for image compression           |
| `maxImageDimension` | number  | 2048       | Max pixel dimension for images               |

Extraction Result

interface ExtractionResult<T> {
  data: T; // Extracted data
  success: boolean; // Whether extraction succeeded
  error?: string; // Error message if failed
  confidence?: {
    overall: number; // Overall confidence (0-1)
    fields: Record<string, number>; // Per-field confidence
  };
  metadata: {
    pagesProcessed: number; // Number of pages processed
    processingTimeMs: number; // Processing time in ms
    modelUsed: string; // Model used
    tokensUsed?: {
      // Token usage (if available)
      prompt: number;
      completion: number;
      total: number;
    };
    documentFormat: "pdf" | "image";
  };
}

Functional Utilities

Core Functions

import { pipe, compose, curry } from "@frasma/extractify";

// Pipe: left-to-right composition
const result = pipe(
  [1, 2, 3, 4, 5],
  filter((x) => x > 2),
  map((x) => x * 2),
);
// result: [6, 8, 10]

// Compose: right-to-left composition
const transform = compose(
  (x: number[]) => x.map((n) => n * 2),
  (x: number[]) => x.filter((n) => n > 2),
);

// Curry: convert function to curried version
const add = curry((a: number, b: number) => a + b);
const add5 = add(5);
add5(3); // 8

Array Functions

import { map, filter, reduce, flatMap } from "@frasma/extractify";

// All functions are curried and composable
const double = map((x: number) => x * 2);
const evens = filter((x: number) => x % 2 === 0);

const result = pipe([1, 2, 3, 4], evens, double);
// result: [4, 8]

Object Functions

import { pick, omit, merge } from "@frasma/extractify";

const user = { name: "John", email: "john@example.com", password: "secret" };

// Pick specific keys
pick(["name", "email"])(user);
// { name: 'John', email: 'john@example.com' }

// Omit specific keys
omit(["password"])(user);
// { name: 'John', email: 'john@example.com' }

// Deep merge objects
merge({ a: 1, b: { c: 2 } })({ b: { d: 3 }, e: 4 });
// { a: 1, b: { c: 2, d: 3 }, e: 4 }

Development

# Install dependencies
npm install

# Run tests
npm test

# Run tests in watch mode
npm run test:watch

# Build
npm run build

# Type check
npm run type-check

# Lint
npm run lint

Requirements

Node.js >= 18.0.0
OpenAI API key (for document extraction)