EdAbati · EdAbati · Oct 24, 2024 · Jul 21, 2024 · Jul 25, 2024 · Jul 27, 2024
diff --git a/README.md b/README.md
@@ -22,9 +22,14 @@ The dataframe libraries currently supported are:
 - [Polars](https://pola.rs)
 
 The library offers various custom [Converters](https://docs.haystack.deepset.ai/docs/converters) components to transform dataframes into Haystack [`Document`](https://docs.haystack.deepset.ai/docs/data-classes#document) objects:
+- `DataFrameFileToDocument` is the main generic converter: it reads files using a dataframe backend and converts them into `Document` objects.
 - `FileToPandasDataFrame` and `FileToPolarsDataFrame` read files and convert them into dataframes.
 - `PandasDataFrameConverter` or `PolarsDataFrameConverter` convert data stored in dataframes into Haystack `Document`objects.
 
+`dataframes-haystack` supports reading files in various formats:
+- _csv_, _json_, _parquet_, _excel_, _html_, _xml_, _orc_, _pickle_, _fixed-width format_ for `pandas`. See the [pandas documentation](https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html) for more details.
+- _csv_, _json_, _parquet_, _excel_, _avro_, _delta_, _ipc_ for `polars`. See the [polars documentation](https://docs.pola.rs/api/python/stable/reference/io.html) for more details.
+
 ## 🛠️ Installation
 
 ```sh
@@ -40,8 +45,31 @@ pip install "dataframes-haystack[polars]"
 > [!TIP]
 > See the [Example Notebooks](./notebooks) for complete examples.
 
+## DataFrameFileToDocument
+
+[Complete example](https://github.com/EdAbati/dataframes-haystack/blob/main/notebooks/dataframe-file-to-doc-example.ipynb)
+
+You can leverage both `pandas` and `polars` backends (thanks to [`narwhals`](https://github.com/narwhals-dev/narwhals)) to read your data!
+
+```python
+from dataframes_haystack.components.converters import DataFrameFileToDocument
+
+converter = DataFrameFileToDocument(content_column="text_str")
+documents = converter.run(file_paths=["file1.csv", "file2.csv"])
+```
+
+```python
+>>> documents
+{'documents': [
+    Document(id=0, content: 'Hello world', meta: {}),
+    Document(id=1, content: 'Hello everyone', meta: {})
+]}
+```
+
 ### Pandas
 
+[Complete example](https://github.com/EdAbati/dataframes-haystack/blob/main/notebooks/pandas-example.ipynb)
+
 #### FileToPandasDataFrame
 
 ```python
@@ -87,6 +115,8 @@ Result:
 
 ### Polars
 
+[Complete example](https://github.com/EdAbati/dataframes-haystack/blob/main/notebooks/polars-example.ipynb)
+
 #### FileToPolarsDataFrame
 
 ```python

diff --git a/notebooks/dataframe-file-to-doc-example.ipynb b/notebooks/dataframe-file-to-doc-example.ipynb
diff --git a/pyproject.toml b/pyproject.toml
@@ -9,7 +9,16 @@ description = "Haystack custom components for your favourite dataframe library."
 readme = "README.md"
 requires-python = ">=3.8"
 license = { file = "LICENSE" }
-keywords = ["nlp", "machine-learning", "ai", "haystack", "pandas", "dataframe", "polars", "llm"]
+keywords = [
+  "nlp",
+  "machine-learning",
+  "ai",
+  "haystack",
+  "pandas",
+  "dataframe",
+  "polars",
+  "llm",
+]
 authors = [{ name = "Edoardo Abati" }]
 classifiers = [
   "License :: OSI Approved :: MIT License",
@@ -26,6 +35,7 @@ classifiers = [
 
 dependencies = [
   "haystack-ai>=2.0.0",
+  "narwhals>=1.1.0",
   "typing_extensions",
 ]
 [project.optional-dependencies]
@@ -42,7 +52,7 @@ path = "src/dataframes_haystack/__about__.py"
 
 # Default environment
 [tool.hatch.envs.default]
-installer="uv"
+installer = "uv"
 dependencies = [
   "coverage[toml]>=6.5",
   "pytest",
@@ -88,8 +98,17 @@ check = "mypy --install-types --non-interactive {args:src/dataframes_haystack te
 detached = true
 dependencies = ["black>=24.3.0", "nbqa>=1.8.5", "ruff>=0.3.4"]
 [tool.hatch.envs.lint.scripts]
-style = ["ruff check {args:.}", "black --check --diff {args:.}", "nbqa black --check --diff notebooks/*"]
-fmt = ["black {args:.}", "ruff check --fix {args:.}", "nbqa black notebooks/*", "style"]
+style = [
+  "ruff check {args:.}",
+  "black --check --diff {args:.}",
+  "nbqa black --check --diff notebooks/*",
+]
+fmt = [
+  "black {args:.}",
+  "ruff check --fix {args:.}",
+  "nbqa black notebooks/*",
+  "style",
+]
 
 [tool.black]
 target-version = ["py38"]
@@ -102,60 +121,45 @@ line-length = 120
 extend-include = ["*.ipynb"]
 
 [tool.ruff.lint]
-select = [
-  "A",
-  "ARG",
-  "B",
-  "C",
-  "DTZ",
-  "E",
-  "EM",
-  "F",
-  "I",
-  "ICN",
-  "ISC",
-  "N",
-  "PLC",
-  "PLE",
-  "PLR",
-  "PLW",
-  "Q",
-  "RUF",
-  "S",
-  "T",
-  "TID",
-  "UP",
-  "W",
-  "YTT",
-]
+select = ["ALL"]
 ignore = [
   # Allow non-abstract empty methods in abstract base classes
   "B027",
+  # No required docstring for modules, packages
+  "D100",
+  "D104",
+  # No future annotations
+  "FA100",
   # Ignore checks for possible passwords
   "S105",
   "S106",
   "S107",
   # Ignore complexity
   "C901",
+  # Generic variable name df is ok
+  "PD901",
   "PLR0911",
   "PLR0912",
   "PLR0913",
   "PLR0915",
 ]
-unfixable = [
-  # Don't touch unused imports
-  "F401",
-]
 
 [tool.ruff.lint.isort]
 known-first-party = ["dataframes_haystack"]
 
 [tool.ruff.lint.flake8-tidy-imports]
 ban-relative-imports = "all"
 
+[tool.ruff.lint.pydocstyle]
+convention = "google"
+
+[tool.ruff.format]
+docstring-code-format = true
+
 [tool.ruff.lint.per-file-ignores]
 # Tests can use magic values, assertions, and relative imports
-"tests/**/*" = ["PLR2004", "S101", "TID252"]
+"tests/*" = ["PLR2004", "S101", "TID252", "D100", "D103"]
+"notebooks/*" = ["PTH123", "SIM115"]
 
 
 # Test coverage
@@ -168,7 +172,10 @@ omit = [
 ]
 
 [tool.coverage.paths]
-dataframes_haystack = ["src/dataframes_haystack", "*/dataframes-haystack/src/dataframes_haystack"]
+dataframes_haystack = [
+  "src/dataframes_haystack",
+  "*/dataframes-haystack/src/dataframes_haystack",
+]
 tests = ["tests", "*/dataframes-haystack/tests"]
 
 [tool.coverage.report]

diff --git a/src/dataframes_haystack/components/converters/__init__.py b/src/dataframes_haystack/components/converters/__init__.py
@@ -0,0 +1,5 @@
+from dataframes_haystack.components.converters._common import DataFrameFileToDocument
+
+__all__ = [
+    "DataFrameFileToDocument",
+]
diff --git a/src/dataframes_haystack/components/converters/_common.py b/src/dataframes_haystack/components/converters/_common.py
@@ -0,0 +1,112 @@
+import logging
+from functools import partial
+from typing import Any, Dict, List, Literal, Optional, Union
+
+import narwhals.stable.v1 as nw
+from haystack import Document, component
+
+from dataframes_haystack.components.converters._utils import (
+    FileFormat,
+    ReaderFunc,
+    frame_to_documents,
+    get_pandas_readers_map,
+    get_polars_readers_map,
+    read_with_select,
+)
+
+logger = logging.getLogger(__name__)
+
+Backends = Literal["pandas", "polars"]  # Values accepted by the `backend` argument of DataFrameFileToDocument.
+
+
+@component
+class DataFrameFileToDocument:
+    """Reads files with a pandas or polars backend and converts their data into Documents.
+
+    Usage example:
+    ```python
+    from dataframes_haystack.components.converters import DataFrameFileToDocument
+
+    converter = DataFrameFileToDocument(content_column="text_str")
+    results = converter.run(file_paths=["file1.csv", "file2.csv"])
+    documents = results["documents"]
+    print(documents[0].content)
+    ```
+    """
+
+    def __init__(
+        self,
+        content_column: str,
+        meta_columns: Union[List[str], None] = None,
+        index_column: Union[str, None] = None,
+        file_format: FileFormat = "csv",
+        read_kwargs: Optional[Dict[str, Any]] = None,
+        backend: Backends = "polars",
+    ) -> None:
+        """Create a DataFrameFileToDocument component.
+
+        Args:
+            content_column: The name of the DataFrame column that contains the text content.
+            meta_columns: Optional list of names of the DataFrame columns that contain metadata.
+            index_column: Optional name of the DataFrame column that contains the index.
+            file_format: The format of the files to read. Raises ValueError if the backend has no reader for it.
+            read_kwargs: Optional keyword arguments to pass to the file reader function.
+            backend: Library used to read the files: "pandas" or "polars". Any other value raises ValueError.
+        """
+        self.content_column = content_column
+        self.meta_columns = meta_columns or []
+        self.index_column = index_column
+        self.file_format = file_format
+        self.read_kwargs = read_kwargs or {}
+        self.backend = backend
+        if self.backend not in ["pandas", "polars"]:  # Must precede the reader lookup, which assumes a valid backend.
+            msg = f"Unsupported backend: {self.backend}"
+            raise ValueError(msg)
+        self._reader_function = self._get_reader_function()  # Resolved eagerly so a bad format fails at init.
+
+    def _get_reader_function(self) -> ReaderFunc:
+        file_format_mapping = get_pandas_readers_map() if self.backend == "pandas" else get_polars_readers_map()
+        reader_function = file_format_mapping.get(self.file_format)  # None if the format is not in the map.
+        if reader_function:
+            return reader_function
+        msg = f"Unsupported file format for {self.backend} backend: {self.file_format}"
+        raise ValueError(msg)
+
+    def _run_read(self, file_paths: List[str]) -> nw.DataFrame:
+        selected_columns = [self.index_column, self.content_column, *self.meta_columns]
+        selected_columns = [col for col in selected_columns if col is not None]  # Drops index_column when unset.
+        read_func = partial(self._reader_function, **self.read_kwargs)  # Bind the user-supplied reader kwargs.
+        df_list = [read_with_select(read_func, file_path=path, columns_subset=selected_columns) for path in file_paths]
+        return nw.concat(df_list, how="vertical")  # One frame holding the rows of all files.
+
+    @component.output_types(documents=List[Document])
+    def run(
+        self,
+        file_paths: List[str],
+        meta: Union[Dict[str, Any], List[Dict[str, Any]], None] = None,
+    ) -> Dict[str, List[Document]]:
+        """Reads files and converts their data into Documents.
+
+        Args:
+            file_paths: List of file paths to read.
+            meta:
+                Optional metadata to attach to the Documents.
+                This value can be either a dictionary or a list of dictionaries.
+                If it's a dictionary, its content is added to the metadata of all produced Documents.
+                If it's a list, the length of the list must match the number of rows in the DataFrame,
+                because the two lists will be zipped.
+
+        Returns:
+            A dictionary with the following keys:
+            - `documents`: Created Documents
+        """
+        df = self._run_read(file_paths)
+        documents = frame_to_documents(
+            df,
+            content_column=self.content_column,
+            meta_columns=self.meta_columns,
+            index_column=self.index_column,
+            extra_metadata=meta,
+        )
+
+        return {"documents": documents}