Infer CSV delimiter if it hasn't been set explicitly

Closes #141
This commit is contained in:
Markus Lanthaler 2017-04-17 18:51:53 +02:00 committed by Adrien Crivelli
parent 0bd3a9c60a
commit 3ee9cc5ce6
No known key found for this signature in database
GPG Key ID: B182FD79DC6DE92E
6 changed files with 112 additions and 6 deletions

View File

@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
- Initial implementation of SUMIFS() function
- Additional codepages
- MemoryDrawing not working in HTML writer [#808](https://github.com/PHPOffice/PHPExcel/issues/808)
- CSV Reader can auto-detect the separator used in file [#141](https://github.com/PHPOffice/PhpSpreadsheet/pull/141)
### Changed

View File

@ -435,11 +435,15 @@ $spreadsheet = $reader->load("sample.csv");
#### Setting CSV options
Often, CSV files are not really "comma separated", or use semicolon (;)
Often, CSV files are not really "comma separated", or use semicolon (`;`)
as a separator. You can set some options on
\PhpOffice\PhpSpreadsheet\Reader\Csv before reading a CSV
file.
The separator will be auto-detected, so in most cases it should not be necessary
to specify it. If auto-detection does not fit your use case, the separator
can be set manually.
Note that \PhpOffice\PhpSpreadsheet\Reader\Csv by default assumes that
the loaded CSV file is UTF-8 encoded. If you are reading CSV files that
were created in Microsoft Office Excel the correct input encoding may

View File

@ -523,15 +523,17 @@ CSV | YES | HTML | NO
### Pipe or Tab Separated Value Files
The CSV loader defaults to loading a file where comma is used as the
separator, but you can modify this to load tab- or pipe-separated value
files using the `setDelimiter()` method.
The CSV loader will attempt to auto-detect the separator used in the file. If it
cannot auto-detect, it will default to the comma. If this does not fit your
use-case, you can manually specify a separator by using the `setDelimiter()`
method.
``` php
$inputFileType = 'Csv';
$inputFileName = './sampleData/example1.tsv';
/** Create a new Reader of the type defined in $inputFileType **/ $reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
/** Create a new Reader of the type defined in $inputFileType **/
$reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
/** Set the delimiter to a TAB character **/
$reader->setDelimiter("\t");
// $reader->setDelimiter('|');

View File

@ -40,7 +40,7 @@ class Csv extends BaseReader implements IReader
*
* @var string
*/
private $delimiter = ',';
private $delimiter = null;
/**
* Enclosure.
@ -152,6 +152,86 @@ class Csv extends BaseReader implements IReader
return $this->skipBOM();
}
/**
 * Infer the delimiter when none has been set yet.
 *
 * Does nothing if $this->delimiter is already non-null. Otherwise it samples up
 * to 1000 lines from $this->fileHandle, counts how often each candidate
 * delimiter occurs on every sampled line, and selects the candidate whose
 * per-line count deviates least from its median. Candidates whose median count
 * is zero are ignored. When nothing can be detected (including when no line
 * could be read) the first candidate, the comma, is used.
 *
 * Lines are obtained with fgets() and are not parsed as CSV, so delimiter
 * characters inside quoted fields are counted like any other occurrence.
 *
 * @return mixed the result of skipBOM() once a delimiter has been chosen;
 *               nothing when the delimiter was already set
 */
protected function inferSeparator()
{
    if ($this->delimiter !== null) {
        return;
    }

    $potentialDelimiters = [',', ';', "\t", '|', ':', ' '];
    $counts = [];
    foreach ($potentialDelimiters as $delimiter) {
        $counts[$delimiter] = [];
    }

    // Count how many times each of the potential delimiters appears in each line.
    // The counter is only advanced for lines that are actually tallied, so it
    // always equals the length of every series in $counts.
    $maxLines = 1000;
    $numberLines = 0;
    while ($numberLines < $maxLines && ($line = fgets($this->fileHandle)) !== false) {
        ++$numberLines;
        foreach ($potentialDelimiters as $delimiter) {
            $counts[$delimiter][] = substr_count($line, $delimiter);
        }
    }

    // Nothing to analyse (empty file): fall back to the default right away
    // instead of indexing into empty series.
    if ($numberLines === 0) {
        $this->delimiter = reset($potentialDelimiters);

        return $this->skipBOM();
    }

    // Calculate the mean square deviations for each delimiter
    // (ignoring delimiters that haven't been found consistently)
    $meanSquareDeviations = [];
    $middleIdx = (int) floor(($numberLines - 1) / 2);
    $hasSingleMiddle = ($numberLines % 2) === 1;

    foreach ($potentialDelimiters as $delimiter) {
        $series = $counts[$delimiter];
        sort($series);

        $median = $hasSingleMiddle
            ? $series[$middleIdx]
            : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

        // A median of zero means the character is absent from at least half of the lines
        if ($median == 0) {
            continue;
        }

        $meanSquareDeviations[$delimiter] = array_reduce(
            $series,
            function ($sum, $value) use ($median) {
                return $sum + pow($value - $median, 2);
            },
            0
        ) / $numberLines;
    }

    // ... and pick the delimiter with the smallest mean square deviation.
    // The strict "<" keeps the earliest entry of $potentialDelimiters on ties.
    $min = INF;
    foreach ($potentialDelimiters as $delimiter) {
        if (!isset($meanSquareDeviations[$delimiter])) {
            continue;
        }

        if ($meanSquareDeviations[$delimiter] < $min) {
            $min = $meanSquareDeviations[$delimiter];
            $this->delimiter = $delimiter;
        }
    }

    // If no delimiter could be detected, fall back to the default
    if ($this->delimiter === null) {
        $this->delimiter = reset($potentialDelimiters);
    }

    return $this->skipBOM();
}
/**
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
*
@ -171,6 +251,7 @@ class Csv extends BaseReader implements IReader
// Skip BOM, if any
$this->skipBOM();
$this->checkSeparator();
$this->inferSeparator();
$worksheetInfo = [];
$worksheetInfo[0]['worksheetName'] = 'Worksheet';
@ -237,6 +318,7 @@ class Csv extends BaseReader implements IReader
// Skip BOM, if any
$this->skipBOM();
$this->checkSeparator();
$this->inferSeparator();
// Create new PhpSpreadsheet object
while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {

View File

@ -24,4 +24,18 @@ class CsvTest extends \PHPUnit_Framework_TestCase
$actual = $reloadedSpreadsheet->getActiveSheet()->getCell('A1')->getCalculatedValue();
$this->assertSame($value, $actual, 'should be able to write and read strings with multiples quotes');
}
/**
 * Loading a semicolon-separated file without configuring a delimiter must make
 * the reader report ';' as its delimiter and keep values containing commas intact.
 */
public function testDelimiterDetection()
{
    $csvReader = new \PhpOffice\PhpSpreadsheet\Reader\Csv();

    // A fresh reader has no delimiter configured
    $this->assertNull($csvReader->getDelimiter());

    $loaded = $csvReader->load(__DIR__ . '/../../data/Reader/CSV/semicolon_separated.csv');
    $this->assertSame(';', $csvReader->getDelimiter(), 'should be able to infer the delimiter');

    $cellC2 = $loaded->getActiveSheet()->getCell('C2')->getValue();
    $this->assertSame('25,5', $cellC2, 'should be able to retrieve values with commas');
}
}

View File

@ -0,0 +1,3 @@
This;Are;Headers
Cell A2;Number with comma;25,5
Two colons and a comma;B|3;:,:
1 This Are Headers
2 Cell A2 Number with comma 25,5
3 Two colons and a comma B|3 :,: