Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,14 @@ The dataframe libraries currently supported are:
- [Polars](https://pola.rs)

The library offers various custom [Converters](https://docs.haystack.deepset.ai/docs/converters) components to transform dataframes into Haystack [`Document`](https://docs.haystack.deepset.ai/docs/data-classes#document) objects:
- `DataFrameFileToDocument` is the main generic converter: it reads files using a dataframe backend and converts them into `Document` objects.
- `FileToPandasDataFrame` and `FileToPolarsDataFrame` read files and convert them into dataframes.
- `PandasDataFrameConverter` and `PolarsDataFrameConverter` convert data stored in dataframes into Haystack `Document` objects.

`dataframes-haystack` supports reading files in various formats:
- _csv_, _json_, _parquet_, _excel_, _html_, _xml_, _orc_, _pickle_, _fixed-width format_ for `pandas`. See the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html) for more details.
- _csv_, _json_, _parquet_, _excel_, _avro_, _delta_, _ipc_ for `polars`. See the [polars documentation](https://docs.pola.rs/api/python/stable/reference/io.html) for more details.

## 🛠️ Installation

```sh
Expand All @@ -40,8 +45,31 @@ pip install "dataframes-haystack[polars]"
> [!TIP]
> See the [Example Notebooks](./notebooks) for complete examples.

## DataFrameFileToDocument

[Complete example](https://github.com/EdAbati/dataframes-haystack/blob/main/notebooks/dataframe-file-to-doc-example.ipynb)

You can leverage both `pandas` and `polars` backends (thanks to [`narwhals`](https://github.com/narwhals-dev/narwhals)) to read your data!

```python
from dataframes_haystack.components.converters import DataFrameFileToDocument

converter = DataFrameFileToDocument(content_column="text_str")
documents = converter.run(file_paths=["file1.csv", "file2.csv"])
```

```python
>>> documents
{'documents': [
Document(id=0, content: 'Hello world', meta: {}),
Document(id=1, content: 'Hello everyone', meta: {})
]}
```

### Pandas

[Complete example](https://github.com/EdAbati/dataframes-haystack/blob/main/notebooks/pandas-example.ipynb)

#### FileToPandasDataFrame

```python
Expand Down Expand Up @@ -87,6 +115,8 @@ Result:

### Polars

[Complete example](https://github.com/EdAbati/dataframes-haystack/blob/main/notebooks/polars-example.ipynb)

#### FileToPolarsDataFrame

```python
Expand Down
467 changes: 467 additions & 0 deletions notebooks/dataframe-file-to-doc-example.ipynb

Large diffs are not rendered by default.

79 changes: 43 additions & 36 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@ description = "Haystack custom components for your favourite dataframe library."
readme = "README.md"
requires-python = ">=3.8"
license = { file = "LICENSE" }
keywords = ["nlp", "machine-learning", "ai", "haystack", "pandas", "dataframe", "polars", "llm"]
keywords = [
"nlp",
"machine-learning",
"ai",
"haystack",
"pandas",
"dataframe",
"polars",
"llm",
]
authors = [{ name = "Edoardo Abati" }]
classifiers = [
"License :: OSI Approved :: MIT License",
Expand All @@ -26,6 +35,7 @@ classifiers = [

dependencies = [
"haystack-ai>=2.0.0",
"narwhals>=1.1.0",
"typing_extensions",
]
[project.optional-dependencies]
Expand All @@ -42,7 +52,7 @@ path = "src/dataframes_haystack/__about__.py"

# Default environment
[tool.hatch.envs.default]
installer="uv"
installer = "uv"
dependencies = [
"coverage[toml]>=6.5",
"pytest",
Expand Down Expand Up @@ -88,8 +98,17 @@ check = "mypy --install-types --non-interactive {args:src/dataframes_haystack te
detached = true
dependencies = ["black>=24.3.0", "nbqa>=1.8.5", "ruff>=0.3.4"]
[tool.hatch.envs.lint.scripts]
style = ["ruff check {args:.}", "black --check --diff {args:.}", "nbqa black --check --diff notebooks/*"]
fmt = ["black {args:.}", "ruff check --fix {args:.}", "nbqa black notebooks/*", "style"]
style = [
"ruff check {args:.}",
"black --check --diff {args:.}",
"nbqa black --check --diff notebooks/*",
]
fmt = [
"black {args:.}",
"ruff check --fix {args:.}",
"nbqa black notebooks/*",
"style",
]

[tool.black]
target-version = ["py38"]
Expand All @@ -102,60 +121,45 @@ line-length = 120
extend-include = ["*.ipynb"]

[tool.ruff.lint]
select = [
"A",
"ARG",
"B",
"C",
"DTZ",
"E",
"EM",
"F",
"I",
"ICN",
"ISC",
"N",
"PLC",
"PLE",
"PLR",
"PLW",
"Q",
"RUF",
"S",
"T",
"TID",
"UP",
"W",
"YTT",
]
select = ["ALL"]
ignore = [
# Allow non-abstract empty methods in abstract base classes
"B027",
# No required docstring for modules, packages
"D100",
"D104",
# No future annotations
"FA100",
# Ignore checks for possible passwords
"S105",
"S106",
"S107",
# Ignore complexity
"C901",
# Generic variable name df is ok
"PD901",
"PLR0911",
"PLR0912",
"PLR0913",
"PLR0915",
]
unfixable = [
# Don't touch unused imports
"F401",
]

[tool.ruff.lint.isort]
known-first-party = ["dataframes_haystack"]

[tool.ruff.lint.flake8-tidy-imports]
ban-relative-imports = "all"

[tool.ruff.lint.pydocstyle]
convention = "google"

[tool.ruff.format]
docstring-code-format = true

[tool.ruff.lint.per-file-ignores]
# Tests can use magic values, assertions, and relative imports
"tests/**/*" = ["PLR2004", "S101", "TID252"]
"tests/*" = ["PLR2004", "S101", "TID252", "D100", "D103"]
"notebooks/*" = ["PTH123", "SIM115"]


# Test coverage
Expand All @@ -168,7 +172,10 @@ omit = [
]

[tool.coverage.paths]
dataframes_haystack = ["src/dataframes_haystack", "*/dataframes-haystack/src/dataframes_haystack"]
dataframes_haystack = [
"src/dataframes_haystack",
"*/dataframes-haystack/src/dataframes_haystack",
]
tests = ["tests", "*/dataframes-haystack/tests"]

[tool.coverage.report]
Expand Down
5 changes: 5 additions & 0 deletions src/dataframes_haystack/components/converters/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from dataframes_haystack.components.converters._common import DataFrameFileToDocument

# Public API of the converters package.
__all__ = ["DataFrameFileToDocument"]
112 changes: 112 additions & 0 deletions src/dataframes_haystack/components/converters/_common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import logging
from functools import partial
from typing import Any, Dict, List, Literal, Optional, Union

import narwhals.stable.v1 as nw
from haystack import Document, component

from dataframes_haystack.components.converters._utils import (
FileFormat,
ReaderFunc,
frame_to_documents,
get_pandas_readers_map,
get_polars_readers_map,
read_with_select,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Names of the dataframe backends accepted by `DataFrameFileToDocument(backend=...)`.
Backends = Literal["pandas", "polars"]


@component
class DataFrameFileToDocument:
    """Reads files and converts their data into Documents.

    Usage example:
    ```python
    from dataframes_haystack.components.converters import DataFrameFileToDocument

    converter = DataFrameFileToDocument(content_column="text_str")
    results = converter.run(file_paths=["file1.csv", "file2.csv"])
    documents = results["documents"]
    print(documents[0].content)
    ```
    """

    def __init__(
        self,
        content_column: str,
        meta_columns: Union[List[str], None] = None,
        index_column: Union[str, None] = None,
        file_format: FileFormat = "csv",
        read_kwargs: Optional[Dict[str, Any]] = None,
        backend: Backends = "polars",
    ) -> None:
        """Create a DataFrameFileToDocument component.

        Args:
            content_column: The name of the DataFrame column that contains the text content.
            meta_columns: Optional list of names of the DataFrame columns that contain metadata.
            index_column: The name of the DataFrame column that contains the index.
            file_format: The format of the files to read.
            read_kwargs: Optional keyword arguments to pass to the file reader function.
            backend: The backend to use for reading the files.

        Raises:
            ValueError: If `backend` is neither "pandas" nor "polars", or if
                `file_format` has no reader registered for the chosen backend.
        """
        self.content_column = content_column
        self.meta_columns = meta_columns or []
        self.index_column = index_column
        self.file_format = file_format
        self.read_kwargs = read_kwargs or {}
        self.backend = backend
        if self.backend not in ("pandas", "polars"):
            msg = f"Unsupported backend: {self.backend}"
            raise ValueError(msg)
        # Resolve the reader once, so an unsupported format fails at construction time.
        self._reader_function = self._get_reader_function()

    def _get_reader_function(self) -> ReaderFunc:
        """Return the reader registered for `self.file_format` in the selected backend.

        Raises:
            ValueError: If the backend's readers map has no entry for the file format.
        """
        file_format_mapping = get_pandas_readers_map() if self.backend == "pandas" else get_polars_readers_map()
        reader_function = file_format_mapping.get(self.file_format)
        if reader_function:
            return reader_function
        msg = f"Unsupported file format for {self.backend} backend: {self.file_format}"
        raise ValueError(msg)

    def _run_read(self, file_paths: List[str]) -> nw.DataFrame:
        """Read every file, keeping only the configured columns, and stack the frames vertically."""
        candidate_columns = [self.index_column, self.content_column, *self.meta_columns]
        # `index_column` is optional: drop it from the selection when it is not set.
        selected_columns = [col for col in candidate_columns if col is not None]
        read_func = partial(self._reader_function, **self.read_kwargs)
        df_list = [read_with_select(read_func, file_path=path, columns_subset=selected_columns) for path in file_paths]
        return nw.concat(df_list, how="vertical")

    @component.output_types(documents=List[Document])
    def run(
        self,
        file_paths: List[str],
        meta: Union[Dict[str, Any], List[Dict[str, Any]], None] = None,
    ) -> Dict[str, List[Document]]:
        """Reads files and converts their data into Documents.

        Args:
            file_paths: List of file paths to read.
            meta:
                Optional metadata to attach to the Documents.
                This value can be either a dictionary or a list of dictionaries.
                If it's a dictionary, its content is added to the metadata of all produced Documents.
                If it's a list, the length of the list must match the number of rows in the DataFrame,
                because the two lists will be zipped.

        Returns:
            A dictionary with the following keys:
            - `documents`: Created Documents. The list is empty when `file_paths` is empty.
        """
        if not file_paths:
            # Nothing to read: return early instead of handing `nw.concat` an empty list of frames.
            return {"documents": []}

        df = self._run_read(file_paths)
        documents = frame_to_documents(
            df,
            content_column=self.content_column,
            meta_columns=self.meta_columns,
            index_column=self.index_column,
            extra_metadata=meta,
        )

        return {"documents": documents}
Loading