> ## Documentation Index
> Fetch the complete documentation index at: https://r2r-patch-fix-ingestion.mintlify.site/llms.txt
> Use this file to discover all available pages before exploring further.

# Ingestion

> Ingesting files with R2R.

## Document Ingestion and Management

### Ingest Files

Ingest files or directories into your R2R system:

```javascript
const files = [
  { path: 'path/to/file1.txt', name: 'file1.txt' },
  { path: 'path/to/file2.txt', name: 'file2.txt' }
];
const metadatas = [{ key1: 'value1' }, { key2: 'value2' }];

const ingestResponse = await client.ingestFiles(files, {
  metadatas,
  user_ids: ['user-id-1', 'user-id-2'],
});
```

<AccordionGroup>
  <Accordion title="Response">
    <ResponseField name="response" type="object">
      The response from the R2R system after ingesting the files.

      ```bash
      [{'message': 'Ingestion task queued successfully.', 'task_id': '6e27dfca-606d-422d-b73f-2d9e138661b4', 'document_id': 'c3291abf-8a4e-5d9d-80fd-232ef6fd8526'}, ...]
      ```
    </ResponseField>
  </Accordion>
</AccordionGroup>

<ParamField path="files" type="Array<string | File | { path: string; name: string }>" required>
  An array of file paths, File objects, or objects with path and name properties to ingest.
</ParamField>

<ParamField path="options" type="object">
  <ParamField path="metadatas" type="Array<Record<string, any>>">
    An optional array of metadata objects corresponding to each file.
  </ParamField>

  <ParamField path="document_ids" type="Array<string>">
    An optional array of document IDs to assign to the ingested files.
  </ParamField>

  <ParamField path="user_ids" type="Array<string | null>">
    An optional array of user IDs associated with the ingested files.
  </ParamField>
</ParamField>

<ParamField path="ingestion_config" type="Record<string, any>">
  The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime.

  <Expandable title="properties">
    <ParamField path="provider" type="str" default="unstructured_local">
      Which chunking provider to use. Options are "r2r", "unstructured\_local", or "unstructured\_api".
    </ParamField>

    <ParamField path="method" type="str" default="by_title">
      Which chunking method to apply. Options are "by\_title", "basic", "recursive", or "character".
    </ParamField>

    <ParamField path="chunk_size" type="int" default="512">
      The average size of chunks, in tokens.
    </ParamField>

    <ParamField path="chunk_overlap" type="int" default="20">
      The default overlap between chunks.
    </ParamField>

    <ParamField path="max_chunk_size" type="Optional[int]" default="None">
      Sets a maximum size on output chunks.
    </ParamField>

    <ParamField path="combine_under_n_chars" type="Optional[int]">
      Combine chunks smaller than this number of characters.
    </ParamField>

    <ParamField path="max_characters" type="Optional[int]">
      Maximum number of characters per chunk.
    </ParamField>

    <ParamField path="coordinates" type="bool" default="False">
      Whether to include coordinates in the output.
    </ParamField>

    <ParamField path="encoding" type="Optional[str]">
      Encoding to use for text files.
    </ParamField>

    <ParamField path="extract_image_block_types" type="Optional[list[str]]">
      Types of image blocks to extract.
    </ParamField>

    <ParamField path="gz_uncompressed_content_type" type="Optional[str]">
      Content type for uncompressed gzip files.
    </ParamField>

    <ParamField path="hi_res_model_name" type="Optional[str]">
      Name of the high-resolution model to use.
    </ParamField>

    <ParamField path="include_orig_elements" type="Optional[bool]" default="False">
      Whether to include original elements in the output.
    </ParamField>

    <ParamField path="include_page_breaks" type="bool">
      Whether to include page breaks in the output.
    </ParamField>

    <ParamField path="languages" type="Optional[list[str]]">
      List of languages to consider for text processing.
    </ParamField>

    <ParamField path="multipage_sections" type="bool" default="True">
      Whether to allow sections to span multiple pages.
    </ParamField>

    <ParamField path="new_after_n_chars" type="Optional[int]">
      Start a new chunk after this many characters.
    </ParamField>

    <ParamField path="ocr_languages" type="Optional[list[str]]">
      Languages to use for OCR.
    </ParamField>

    <ParamField path="output_format" type="str" default="application/json">
      Format of the output.
    </ParamField>

    <ParamField path="overlap" type="int" default="0">
      Number of characters to overlap between chunks.
    </ParamField>

    <ParamField path="overlap_all" type="bool" default="False">
      Whether to overlap all chunks.
    </ParamField>

    <ParamField path="pdf_infer_table_structure" type="bool" default="True">
      Whether to infer table structure in PDFs.
    </ParamField>

    <ParamField path="similarity_threshold" type="Optional[float]">
      Threshold for considering chunks similar.
    </ParamField>

    <ParamField path="skip_infer_table_types" type="Optional[list[str]]">
      Types of tables to skip inferring.
    </ParamField>

    <ParamField path="split_pdf_concurrency_level" type="int" default="5">
      Concurrency level for splitting PDFs.
    </ParamField>

    <ParamField path="split_pdf_page" type="bool" default="True">
      Whether to split PDFs by page.
    </ParamField>

    <ParamField path="starting_page_number" type="Optional[int]">
      Page number to start processing from.
    </ParamField>

    <ParamField path="strategy" type="str" default="auto">
      Strategy for processing. Options are "auto", "fast", or "hi\_res".
    </ParamField>

    <ParamField path="chunking_strategy" type="Optional[str]" default="by_title">
      Strategy for chunking. Options are "by\_title" or "basic".
    </ParamField>

    <ParamField path="unique_element_ids" type="bool" default="False">
      Whether to generate unique IDs for elements.
    </ParamField>

    <ParamField path="xml_keep_tags" type="bool" default="False">
      Whether to keep XML tags in the output.
    </ParamField>
  </Expandable>
</ParamField>

### Update Files

Update existing documents:

```javascript
const files = [
  { path: '/path/to/updated_file1.txt', name: 'updated_file1.txt' }
];
const document_ids = ['document-id-1'];
const updateResponse = await client.updateFiles(files, {
  document_ids,
  metadatas: [{ key: 'updated_value' }] // to overwrite the existing metadata
});
```

<AccordionGroup>
  <Accordion title="Response">
    <ResponseField name="response" type="object">
      The response from the R2R system after updating the files.

      ```bash
      {'results': {'processed_documents': [{'id': '9f375ce9-efe9-5b57-8bf2-a63dee5f3621', 'title': 'aristotle_v2.txt'}], 'failed_documents': [], 'skipped_documents': []}}
      ```
    </ResponseField>
  </Accordion>
</AccordionGroup>

<ParamField path="files" type="Array<File | { path: string; name: string }>" required>
  An array of File objects or objects with path and name properties to update.
</ParamField>

<ParamField path="options" type="object" required>
  <ParamField path="document_ids" type="Array<string>" required>
    An array of document IDs corresponding to the files being updated.
  </ParamField>

  <ParamField path="metadatas" type="Array<Record<string, any>>">
    An optional array of metadata objects for the updated files.
  </ParamField>

  <ParamField path="ingestion_config" type="Record<string, any>">
    The ingestion config override parameter enables developers to customize their R2R chunking strategy at runtime.

    <Expandable title="properties">
      <ParamField path="provider" type="str" default="r2r">
        Which chunking provider to use, `r2r` or `unstructured`. Selecting `unstructured` is generally recommended when parsing with `unstructured` or `unstructured_api`.
      </ParamField>

      <ParamField path="method" type="str" default="recursive">
        Which chunking method to apply. When using `unstructured`, `by_title` or `basic` are supported.
      </ParamField>

      <ParamField path="chunk_size" type="int" default="512">
        The average size of chunks, in tokens.
      </ParamField>

      <ParamField path="chunk_overlap" type="int" default="20">
        The default overlap between chunks.
      </ParamField>

      <ParamField path="max_chunk_size" type="Optional[int]" default="None">
        Sets a maximum size on output chunks.
      </ParamField>
    </Expandable>
  </ParamField>
</ParamField>

### Documents Overview

Retrieve high-level document information. Results are restricted to the calling user's files, unless the caller is a superuser, in which case documents from all users are returned:

```javascript
const documentsOverview = await client.documentsOverview();
```

<AccordionGroup>
  <Accordion title="Response">
    <ResponseField name="response" type="Array<object>">
      An array of objects containing document information.

      ```bash
      [
        {
          'document_id': '9fbe403b-c11c-5aae-8ade-ef22980c3ad1',
          'version': 'v1',
          'size_in_bytes': 73353,
          'status': 'success',
          'user_id': '2acb499e-8428-543b-bd85-0d9098718220',
          'title': 'aristotle.txt',
          'created_at': '2024-07-21T20:09:14.218741Z',
          'updated_at': '2024-07-21T20:09:14.218741Z',
          'metadata': {'x': 'y'}
        },
        ...
      ]
      ```
    </ResponseField>
  </Accordion>
</AccordionGroup>

<ParamField path="document_ids" type="Array<string>">
  An optional array of document IDs to filter the overview.
</ParamField>

### Document Chunks

Fetch chunks for a particular document:

```javascript
const documentId = '9fbe403b-c11c-5aae-8ade-ef22980c3ad1';
const chunks = await client.documentChunks(documentId);
```

<AccordionGroup>
  <Accordion title="Response">
    <ResponseField name="response" type="Array<object>">
      An array of objects containing chunk information.

      ```bash
      [{
        'text': 'Aristotle[A] (Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was an Ancient Greek philosopher and polymath...',
        'user_id': '2acb499e-8428-543b-bd85-0d9098718220',
        'document_id': '9fbe403b-c11c-5aae-8ade-ef22980c3ad1',
        'extraction_id': 'aeba6400-1bd0-5ee9-8925-04732d675434',
        'fragment_id': 'f48bcdad-4155-52a4-8c9d-8ba06e996ba3',
        'metadata': {'title': 'aristotle.txt', 'version': 'v0', 'chunk_order': 0}
      },
      ...]
      ```
    </ResponseField>
  </Accordion>
</AccordionGroup>

<ParamField path="document_id" type="string" required>
  The ID of the document to retrieve chunks for.
</ParamField>

### Delete Documents

Delete a document by its ID:

```javascript
const deleteResponse = await client.delete({ document_id: "91662726-7271-51a5-a0ae-34818509e1fd" });
```

<AccordionGroup>
  <Accordion title="Response">
    <ResponseField name="response" type="object">
      The response from the R2R system after successfully deleting the documents.

      ```bash
      {'results': {}}
      ```
    </ResponseField>
  </Accordion>
</AccordionGroup>

<ParamField path="filters" type="{ [key: string]: string | string[] }" required>
  A set of logical filters applied to document fields that identifies the unique set of documents to delete (e.g., `{"document_id": {"$eq": "9fbe403b-c11c-5aae-8ade-ef22980c3ad1"}}`). Filters may reference fields such as `"user_id"` or `"title"` and use operators like `neq`, `gte`, etc.
</ParamField>
