parent
0bd3a9c60a
commit
3ee9cc5ce6
|
@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
|
||||||
- Initial implementation of SUMIFS() function
|
- Initial implementation of SUMIFS() function
|
||||||
- Additional codepages
|
- Additional codepages
|
||||||
- MemoryDrawing not working in HTML writer [#808](https://github.com/PHPOffice/PHPExcel/issues/808)
|
- MemoryDrawing not working in HTML writer [#808](https://github.com/PHPOffice/PHPExcel/issues/808)
|
||||||
|
- CSV Reader can auto-detect the separator used in file [#141](https://github.com/PHPOffice/PhpSpreadsheet/pull/141)
|
||||||
|
|
||||||
### Changed
|
### Changed
|
||||||
|
|
||||||
|
|
|
@ -435,11 +435,15 @@ $spreadsheet = $reader->load("sample.csv");
|
||||||
|
|
||||||
#### Setting CSV options
|
#### Setting CSV options
|
||||||
|
|
||||||
Often, CSV files are not really "comma separated", or use semicolon (;)
|
Often, CSV files are not really "comma separated", or use semicolon (`;`)
|
||||||
as a separator. You can instruct
|
as a separator. You can instruct
|
||||||
\PhpOffice\PhpSpreadsheet\Reader\Csv to use specific options before reading a CSV
|
\PhpOffice\PhpSpreadsheet\Reader\Csv to use specific options before reading a CSV
|
||||||
file.
|
file.
|
||||||
|
|
||||||
|
The separator will be auto-detected, so in most cases it should not be necessary
|
||||||
|
to specify it. If auto-detection does not fit your use case, the separator
|
||||||
|
can be set manually.
|
||||||
|
|
||||||
Note that \PhpOffice\PhpSpreadsheet\Reader\Csv by default assumes that
|
Note that \PhpOffice\PhpSpreadsheet\Reader\Csv by default assumes that
|
||||||
the loaded CSV file is UTF-8 encoded. If you are reading CSV files that
|
the loaded CSV file is UTF-8 encoded. If you are reading CSV files that
|
||||||
were created in Microsoft Office Excel the correct input encoding may
|
were created in Microsoft Office Excel the correct input encoding may
|
||||||
|
|
|
@ -523,15 +523,17 @@ CSV | YES | HTML | NO
|
||||||
|
|
||||||
### Pipe or Tab Separated Value Files
|
### Pipe or Tab Separated Value Files
|
||||||
|
|
||||||
The CSV loader defaults to loading a file where comma is used as the
|
The CSV loader will attempt to auto-detect the separator used in the file. If it
|
||||||
separator, but you can modify this to load tab- or pipe-separated value
|
cannot auto-detect, it will default to the comma. If this does not fit your
|
||||||
files using the `setDelimiter()` method.
|
use-case, you can manually specify a separator by using the `setDelimiter()`
|
||||||
|
method.
|
||||||
|
|
||||||
``` php
|
``` php
|
||||||
$inputFileType = 'Csv';
|
$inputFileType = 'Csv';
|
||||||
$inputFileName = './sampleData/example1.tsv';
|
$inputFileName = './sampleData/example1.tsv';
|
||||||
|
|
||||||
/** Create a new Reader of the type defined in $inputFileType **/ $reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
|
/** Create a new Reader of the type defined in $inputFileType **/
|
||||||
|
$reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
|
||||||
/** Set the delimiter to a TAB character **/
|
/** Set the delimiter to a TAB character **/
|
||||||
$reader->setDelimiter("\t");
|
$reader->setDelimiter("\t");
|
||||||
// $reader->setDelimiter('|');
|
// $reader->setDelimiter('|');
|
||||||
|
|
|
@ -40,7 +40,7 @@ class Csv extends BaseReader implements IReader
|
||||||
*
|
*
|
||||||
* @var string
|
* @var string
|
||||||
*/
|
*/
|
||||||
private $delimiter = ',';
|
private $delimiter = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Enclosure.
|
* Enclosure.
|
||||||
|
@ -152,6 +152,86 @@ class Csv extends BaseReader implements IReader
|
||||||
return $this->skipBOM();
|
return $this->skipBOM();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Infer the delimiter from the file contents when none has been set beforehand.
 *
 * Up to 1000 lines are read from the current position of the file handle. For
 * every candidate delimiter the number of occurrences per line is recorded; the
 * candidate whose per-line count deviates least from its median (smallest mean
 * square deviation) wins. Candidates whose median count is zero are ignored.
 * Ties are resolved in favour of the candidate listed first, and if no candidate
 * qualifies (including an empty file) the first candidate, a comma, is used.
 *
 * NOTE(review): occurrences inside quoted fields are counted as well; the
 * enclosure is not taken into account here.
 *
 * @return mixed Nothing when a delimiter was already set; otherwise the result
 *               of skipBOM(), which is called after the file has been sampled
 *               (presumably to reposition the file pointer - confirm against
 *               skipBOM()).
 */
protected function inferSeparator()
{
    if ($this->delimiter !== null) {
        return;
    }

    $potentialDelimiters = [',', ';', "\t", '|', ':', ' '];
    $maxSampledLines = 1000;

    $counts = [];
    foreach ($potentialDelimiters as $delimiter) {
        $counts[$delimiter] = [];
    }

    // Count how many times each of the potential delimiters appears in each line.
    // The line counter is only advanced for lines that are actually tallied, so it
    // always equals the length of every series in $counts.
    $numberLines = 0;
    while ($numberLines < $maxSampledLines && ($line = fgets($this->fileHandle)) !== false) {
        ++$numberLines;
        foreach ($potentialDelimiters as $delimiter) {
            $counts[$delimiter][] = substr_count($line, $delimiter);
        }
    }

    // Calculate the mean square deviations for each delimiter
    // (ignoring delimiters that haven't been found consistently).
    $meanSquareDeviations = [];
    if ($numberLines > 0) {
        $middleIdx = (int) floor(($numberLines - 1) / 2);

        foreach ($potentialDelimiters as $delimiter) {
            $series = $counts[$delimiter];
            sort($series);

            $median = ($numberLines % 2)
                ? $series[$middleIdx]
                : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

            // Integer operands that divide evenly yield an integer in PHP, so an
            // all-zero middle of the series compares identical to int 0 here.
            if ($median === 0) {
                continue;
            }

            $meanSquareDeviations[$delimiter] = array_reduce(
                $series,
                function ($sum, $value) use ($median) {
                    return $sum + pow($value - $median, 2);
                },
                0
            ) / count($series);
        }
    }

    // ... and pick the delimiter with the smallest mean square deviation
    // (in case of ties, the order in $potentialDelimiters is respected).
    $min = INF;
    foreach ($potentialDelimiters as $delimiter) {
        if (!isset($meanSquareDeviations[$delimiter])) {
            continue;
        }

        if ($meanSquareDeviations[$delimiter] < $min) {
            $min = $meanSquareDeviations[$delimiter];
            $this->delimiter = $delimiter;
        }
    }

    // If no delimiter could be detected, fall back to the default.
    if ($this->delimiter === null) {
        $this->delimiter = reset($potentialDelimiters);
    }

    return $this->skipBOM();
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
|
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
|
||||||
*
|
*
|
||||||
|
@ -171,6 +251,7 @@ class Csv extends BaseReader implements IReader
|
||||||
// Skip BOM, if any
|
// Skip BOM, if any
|
||||||
$this->skipBOM();
|
$this->skipBOM();
|
||||||
$this->checkSeparator();
|
$this->checkSeparator();
|
||||||
|
$this->inferSeparator();
|
||||||
|
|
||||||
$worksheetInfo = [];
|
$worksheetInfo = [];
|
||||||
$worksheetInfo[0]['worksheetName'] = 'Worksheet';
|
$worksheetInfo[0]['worksheetName'] = 'Worksheet';
|
||||||
|
@ -237,6 +318,7 @@ class Csv extends BaseReader implements IReader
|
||||||
// Skip BOM, if any
|
// Skip BOM, if any
|
||||||
$this->skipBOM();
|
$this->skipBOM();
|
||||||
$this->checkSeparator();
|
$this->checkSeparator();
|
||||||
|
$this->inferSeparator();
|
||||||
|
|
||||||
// Create new PhpSpreadsheet object
|
// Create new PhpSpreadsheet object
|
||||||
while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
|
while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
|
||||||
|
|
|
@ -24,4 +24,18 @@ class CsvTest extends \PHPUnit_Framework_TestCase
|
||||||
$actual = $reloadedSpreadsheet->getActiveSheet()->getCell('A1')->getCalculatedValue();
|
$actual = $reloadedSpreadsheet->getActiveSheet()->getCell('A1')->getCalculatedValue();
|
||||||
$this->assertSame($value, $actual, 'should be able to write and read strings with multiples quotes');
|
$this->assertSame($value, $actual, 'should be able to write and read strings with multiples quotes');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Loading a semicolon-separated fixture must leave the reader with ';' as its
 * delimiter and keep commas that are part of a cell value intact.
 */
public function testDelimiterDetection()
{
    $csvReader = new \PhpOffice\PhpSpreadsheet\Reader\Csv();

    // A freshly constructed reader reports no delimiter.
    $initialDelimiter = $csvReader->getDelimiter();
    $this->assertNull($initialDelimiter);

    $workbook = $csvReader->load(__DIR__ . '/../../data/Reader/CSV/semicolon_separated.csv');

    $detectedDelimiter = $csvReader->getDelimiter();
    $this->assertSame(';', $detectedDelimiter, 'should be able to infer the delimiter');

    // Cell C2 of the fixture holds a value that itself contains a comma.
    $sheet = $workbook->getActiveSheet();
    $cellValue = $sheet->getCell('C2')->getValue();
    $this->assertSame('25,5', $cellValue, 'should be able to retrieve values with commas');
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
This;Are;Headers
|
||||||
|
Cell A2;Number with comma;25,5
|
||||||
|
Two colons and a comma;B|3;:,:
|
|
Loading…
Reference in New Issue