From 836b39b3b154a38f32339c17f997b97e40d4ac88 Mon Sep 17 00:00:00 2001 From: Jesper Zedlitz Date: Tue, 13 Dec 2022 10:14:33 +0100 Subject: [PATCH] added an optional parameter to specify the character encoding of a source --- .../frictionlessdata/tableschema/Table.java | 29 +++++++++++++++---- .../AbstractTableDataSource.java | 3 +- .../tabledatasource/TableDataSource.java | 16 +++++----- .../table_tests/TableEncodingTests.java | 7 +++-- .../JsonArrayTableDataSourceTest.java | 3 +- .../TableDataSourceFormatsTest.java | 5 ++-- .../fixtures/csv/encodings/iso8859.csv | 2 ++ 7 files changed, 45 insertions(+), 20 deletions(-) create mode 100644 src/test/resources/fixtures/csv/encodings/iso8859.csv diff --git a/src/main/java/io/frictionlessdata/tableschema/Table.java b/src/main/java/io/frictionlessdata/tableschema/Table.java index 0926fe4a..25a4fff8 100644 --- a/src/main/java/io/frictionlessdata/tableschema/Table.java +++ b/src/main/java/io/frictionlessdata/tableschema/Table.java @@ -18,6 +18,8 @@ import java.io.*; import java.net.URL; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.*; /** @@ -123,8 +125,8 @@ public static Table fromSource(InputStream data, InputStream schema, CSVFormat f * @param format The expected CSVFormat if dataSource is a CSV-containing InputStream; ignored for JSON data. * Can be `null` */ - public static Table fromSource(File dataSource, File basePath, Schema schema, CSVFormat format) { - Table table = fromSource(dataSource, basePath); + public static Table fromSource(File dataSource, File basePath, Schema schema, CSVFormat format, Charset charset) { + Table table = fromSource(dataSource, basePath, charset); table.schema = schema; if (null != format) { table.setCsvFormat(format); @@ -132,6 +134,23 @@ public static Table fromSource(File dataSource, File basePath, Schema schema, CS return table; } + public static Table fromSource(File dataSource, File basePath, Schema schema, CSVFormat format) { + return fromSource(dataSource, basePath, schema, format, null); + } + + /** + * Create Table from a {@link java.io.File} containing the CSV/JSON + * data and without either a Schema or a CSVFormat. + * @param dataSource relative File for reading the data from. Must be inside `basePath` + * @param basePath Parent directory + * @param charset Character encoding of the file + */ + public static Table fromSource(File dataSource, File basePath, Charset charset) { + Table table = new Table(); + table.dataSource = TableDataSource.fromSource(dataSource, basePath, charset); + return table; + } + /** * Create Table from a {@link java.io.File} containing the CSV/JSON * data and without either a Schema or a CSVFormat. @@ -139,9 +158,7 @@ public static Table fromSource(File dataSource, File basePath, Schema schema, CS * @param basePath Parent directory */ public static Table fromSource(File dataSource, File basePath) { - Table table = new Table(); - table.dataSource = TableDataSource.fromSource(dataSource, basePath); - return table; + return fromSource(dataSource, basePath, null); } /** @@ -729,4 +746,4 @@ private void writeCSVData(Map mapping, CSVPrinter csvPrinter) } }); } -} \ No newline at end of file +} diff --git a/src/main/java/io/frictionlessdata/tableschema/tabledatasource/AbstractTableDataSource.java b/src/main/java/io/frictionlessdata/tableschema/tabledatasource/AbstractTableDataSource.java index 2cddb4bf..7e80fbba 100644 --- a/src/main/java/io/frictionlessdata/tableschema/tabledatasource/AbstractTableDataSource.java +++ b/src/main/java/io/frictionlessdata/tableschema/tabledatasource/AbstractTableDataSource.java @@ -3,6 +3,7 @@ import java.io.File; import java.io.IOException; import java.net.URL; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -32,7 +33,7 @@ public List getDataAsStringArray() { } String getFileContents(String path) throws IOException { - return TableDataSource.getFileContents(path, workDir); + return TableDataSource.getFileContents(path, workDir, Charset.defaultCharset()); } } diff --git a/src/main/java/io/frictionlessdata/tableschema/tabledatasource/TableDataSource.java b/src/main/java/io/frictionlessdata/tableschema/tabledatasource/TableDataSource.java index 3cf8b829..800ac770 100644 --- a/src/main/java/io/frictionlessdata/tableschema/tabledatasource/TableDataSource.java +++ b/src/main/java/io/frictionlessdata/tableschema/tabledatasource/TableDataSource.java @@ -1,6 +1,7 @@ package io.frictionlessdata.tableschema.tabledatasource; import com.fasterxml.jackson.databind.node.ArrayNode; +import com.google.common.primitives.Chars; import io.frictionlessdata.tableschema.Table; import io.frictionlessdata.tableschema.exception.TableIOException; import io.frictionlessdata.tableschema.inputstream.ByteOrderMarkStrippingInputStream; @@ -8,6 +9,7 @@ import org.apache.commons.csv.CSVFormat; import java.io.*; +import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashMap; @@ -81,9 +83,9 @@ static TableDataSource fromSource(String input) { * {@link CsvTableDataSource} based on input format * @return DataSource created from input File */ - static TableDataSource fromSource(File input, File workDir) { + static TableDataSource fromSource(File input, File workDir, Charset charset) { try { - String content = getFileContents(input.getPath(), workDir); + String content = getFileContents(input.getPath(), workDir, charset); return fromSource(content); } catch (IOException ex) { throw new TableIOException(ex); @@ -109,7 +111,7 @@ static TableDataSource fromSource(InputStream input) { return fromSource(content); } - static String getFileContents(String path, File workDir) throws IOException { + static String getFileContents(String path, File workDir, Charset charset) throws IOException { String lines; if (workDir.getName().endsWith(".zip")) { //have to exchange the backslashes on Windows, as @@ -119,7 +121,7 @@ static String getFileContents(String path, File workDir) throws IOException { ZipFile zipFile = new ZipFile(workDir.getAbsolutePath()); ZipEntry entry = zipFile.getEntry(path); InputStream stream = zipFile.getInputStream(entry); - lines = readSkippingBOM(stream); + lines = readSkippingBOM(stream, charset); } else { // The path value can either be a relative path or a full path. // If it's a relative path then build the full path by using the working directory. @@ -129,7 +131,7 @@ static String getFileContents(String path, File workDir) throws IOException { // - https://github.com/frictionlessdata/tableschema-java/issues/29 // - https://frictionlessdata.io/specs/data-resource/#url-or-path Path resolvedPath = TableDataSource.toSecure(new File(path).toPath(), workDir.toPath()); - lines = readSkippingBOM(new FileInputStream(resolvedPath.toFile())); + lines = readSkippingBOM(new FileInputStream(resolvedPath.toFile()), charset); } return lines; } @@ -141,10 +143,10 @@ static String getFileContents(String path, File workDir) throws IOException { * @param is InputStream to read from * @return Contents of the InputStream as a String */ - static String readSkippingBOM(InputStream is) { + static String readSkippingBOM(InputStream is, Charset charset) { String content; try (ByteOrderMarkStrippingInputStream bims = new ByteOrderMarkStrippingInputStream(is); - InputStreamReader isr = new InputStreamReader(bims.skipBOM(), bims.getCharset()); + InputStreamReader isr = new InputStreamReader(bims.skipBOM(), charset == null ? bims.getCharset() : charset); BufferedReader rdr = new BufferedReader(isr)) { content = rdr.lines().collect(Collectors.joining("\n")); } catch (IOException ex) { diff --git a/src/test/java/io/frictionlessdata/tableschema/table_tests/TableEncodingTests.java b/src/test/java/io/frictionlessdata/tableschema/table_tests/TableEncodingTests.java index 7371c8ea..4989da35 100644 --- a/src/test/java/io/frictionlessdata/tableschema/table_tests/TableEncodingTests.java +++ b/src/test/java/io/frictionlessdata/tableschema/table_tests/TableEncodingTests.java @@ -7,8 +7,10 @@ import org.junit.jupiter.api.Test; import java.io.File; +import java.nio.charset.StandardCharsets; import java.util.Iterator; +import static io.frictionlessdata.tableschema.TestHelper.getTestDataDirectory; import static io.frictionlessdata.tableschema.TestHelper.getTestsuiteDataDirectory; public class TableEncodingTests { @@ -17,12 +19,11 @@ public class TableEncodingTests { // currently disabled @Test @DisplayName("Create a Table from a ISO-8859-1 encoded file") - @Disabled void createTableFromIso8859() throws Exception{ - File testDataDir = getTestsuiteDataDirectory(); + File testDataDir = getTestDataDirectory(); Table table - = Table.fromSource(new File("csv/encodings/iso8859.csv"), testDataDir, null, null); + = Table.fromSource(new File("csv/encodings/iso8859.csv"), testDataDir, null, null, StandardCharsets.ISO_8859_1); Iterator iter = table.iterator(); Object[] row = iter.next(); diff --git a/src/test/java/io/frictionlessdata/tableschema/tabledatasource/JsonArrayTableDataSourceTest.java b/src/test/java/io/frictionlessdata/tableschema/tabledatasource/JsonArrayTableDataSourceTest.java index dfcaef30..fda3adb5 100644 --- a/src/test/java/io/frictionlessdata/tableschema/tabledatasource/JsonArrayTableDataSourceTest.java +++ b/src/test/java/io/frictionlessdata/tableschema/tabledatasource/JsonArrayTableDataSourceTest.java @@ -9,6 +9,7 @@ import org.junit.jupiter.api.Test; import java.io.*; +import java.nio.charset.Charset; import java.nio.file.Files; import java.util.stream.Collectors; @@ -74,7 +75,7 @@ void testJsonArrayDataSourceHeaders() throws Exception{ @DisplayName("Validate creating a JsonArrayTableDataSource from JSON file") void testSafePathCreationJson() throws Exception { TableDataSource ds = TableDataSource.fromSource(new File("simple_geojson.json"), - TestHelper.getTestDataDirectory()); + TestHelper.getTestDataDirectory(), Charset.defaultCharset()); Assertions.assertNotNull(ds); } /* diff --git a/src/test/java/io/frictionlessdata/tableschema/tabledatasource/TableDataSourceFormatsTest.java b/src/test/java/io/frictionlessdata/tableschema/tabledatasource/TableDataSourceFormatsTest.java index 6434ffbf..16e16751 100644 --- a/src/test/java/io/frictionlessdata/tableschema/tabledatasource/TableDataSourceFormatsTest.java +++ b/src/test/java/io/frictionlessdata/tableschema/tabledatasource/TableDataSourceFormatsTest.java @@ -8,6 +8,7 @@ import java.io.*; import java.net.URL; +import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -85,7 +86,7 @@ public void testSafePath() throws Exception { @Test @DisplayName("Create a TableDataSource from a safe path and ensure no exception is thrown") public void testSafePathCreationCsv() throws Exception { - TableDataSource ds = TableDataSource.fromSource(new File ("data/population.csv"), TestHelper.getTestDataDirectory()); + TableDataSource ds = TableDataSource.fromSource(new File ("data/population.csv"), TestHelper.getTestDataDirectory(), Charset.defaultCharset()); Assertions.assertNotNull(ds); } @@ -162,7 +163,7 @@ public void testZipInputFileCreationCsv2() throws Exception { TableDataSource ds; File basePath = new File(TestHelper.getTestDataDirectory(),"data/population.zip"); File inFile = new File("population.csv"); - ds = TableDataSource.fromSource(inFile,basePath); + ds = TableDataSource.fromSource(inFile,basePath, null); List data = ds.getDataAsStringArray(); Assertions.assertNotNull(data); byte[] bytes = Files.readAllBytes(new File(TestHelper.getTestDataDirectory(), "data/population.csv").toPath()); diff --git a/src/test/resources/fixtures/csv/encodings/iso8859.csv b/src/test/resources/fixtures/csv/encodings/iso8859.csv new file mode 100644 index 00000000..2d0ec6a0 --- /dev/null +++ b/src/test/resources/fixtures/csv/encodings/iso8859.csv @@ -0,0 +1,2 @@ +name +Réunion