|
4 | 4 | # which is available at https://spdx.org/licenses/BSD-3-Clause-Clear.html or # |
5 | 5 | # see the "LICENSE.md" file for more details. # |
6 | 6 | ###################################################################################### |
7 | | -"""Test consistency of the created files with the input data""" |
8 | 7 | import os |
9 | 8 | import shutil |
10 | 9 | import unittest |
@@ -700,3 +699,47 @@ def _test_domain_coherence(self, ds, ref_var_types): |
700 | 699 | for var in out_domain.get_dictionary(table.name).variables |
701 | 700 | } |
702 | 701 | self.assertEqual(ref_var_types[table.name], out_dictionary_var_types) |
| 702 | + |
| 703 | + |
| 704 | +class DataFramePreprocessingTests(unittest.TestCase): |
| 705 | + """Check that the preprocessing of X (input features collection) is actually done |
| 706 | + when writing the csv used later by Khiops |
| 707 | + """ |
| 708 | + |
| 709 | + def setUp(self): |
| 710 | + """Set-up test-specific output directory""" |
| 711 | + self.output_dir = os.path.join("resources", "tmp", self._testMethodName) |
| 712 | + os.makedirs(self.output_dir, exist_ok=True) |
| 713 | + |
| 714 | + def tearDown(self): |
| 715 | + """Clean-up test-specific output directory""" |
| 716 | + shutil.rmtree(self.output_dir, ignore_errors=True) |
| 717 | + del self.output_dir |
| 718 | + |
| 719 | + @staticmethod |
| 720 | + def create_monotable_dataset_with_newlines(): |
| 721 | + data = { |
| 722 | + "User_ID": [ |
| 723 | + "Cm6fu01r99", |
| 724 | + ], |
| 725 | + "Age": [39], |
| 726 | + "Title": [ |
| 727 | + "Shimmer,\nsurprisingly\n\rgoes with lots", |
| 728 | + ], |
| 729 | + } |
| 730 | + dataset = pd.DataFrame(data) |
| 731 | + return dataset |
| 732 | + |
| 733 | + def test_newlines_removed_from_csv_file_for_khiops(self): |
| 734 | + dataset = Dataset( |
| 735 | + DataFramePreprocessingTests.create_monotable_dataset_with_newlines() |
| 736 | + ) |
| 737 | + |
| 738 | + out_table_path, _ = dataset.create_table_files_for_khiops(self.output_dir) |
| 739 | + out_table = pd.read_csv(out_table_path, sep="\t") |
| 740 | + |
| 741 | + self.assertEqual( |
| 742 | + "Shimmer, surprisingly goes with lots", |
| 743 | + out_table.Title[0], |
| 744 | + "Newlines should have been removed from the data", |
| 745 | + ) |
0 commit comments