parent
0bd3a9c60a
commit
3ee9cc5ce6
@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/).
|
||||
- Initial implementation of SUMIFS() function
|
||||
- Additional codepages
|
||||
- MemoryDrawing not working in HTML writer [#808](https://github.com/PHPOffice/PHPExcel/issues/808)
|
||||
- CSV Reader can auto-detect the separator used in file [#141](https://github.com/PHPOffice/PhpSpreadsheet/pull/141)
|
||||
|
||||
### Changed
|
||||
|
||||
|
@ -435,11 +435,15 @@ $spreadsheet = $reader->load("sample.csv");
|
||||
|
||||
#### Setting CSV options
|
||||
|
||||
Often, CSV files are not really "comma separated", or use semicolon (;)
|
||||
Often, CSV files are not really "comma separated", or use semicolon (`;`)
|
||||
as a separator. You can set some options on
|
||||
\PhpOffice\PhpSpreadsheet\Reader\Csv before reading a CSV
|
||||
file.
|
||||
|
||||
The separator will be auto-detected, so in most cases it should not be necessary
|
||||
to specify it. But in cases where auto-detection does not fit the use-case,
|
||||
it can be set manually.
|
||||
|
||||
Note that \PhpOffice\PhpSpreadsheet\Reader\Csv by default assumes that
|
||||
the loaded CSV file is UTF-8 encoded. If you are reading CSV files that
|
||||
were created in Microsoft Office Excel the correct input encoding may
|
||||
|
@ -523,15 +523,17 @@ CSV | YES | HTML | NO
|
||||
|
||||
### Pipe or Tab Separated Value Files
|
||||
|
||||
The CSV loader defaults to loading a file where comma is used as the
|
||||
separator, but you can modify this to load tab- or pipe-separated value
|
||||
files using the `setDelimiter()` method.
|
||||
The CSV loader will attempt to auto-detect the separator used in the file. If it
|
||||
cannot auto-detect, it will default to the comma. If this does not fit your
|
||||
use-case, you can manually specify a separator by using the `setDelimiter()`
|
||||
method.
|
||||
|
||||
``` php
|
||||
$inputFileType = 'Csv';
|
||||
$inputFileName = './sampleData/example1.tsv';
|
||||
|
||||
/** Create a new Reader of the type defined in $inputFileType **/ $reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
|
||||
/** Create a new Reader of the type defined in $inputFileType **/
|
||||
$reader = \PhpOffice\PhpSpreadsheet\IOFactory::createReader($inputFileType);
|
||||
/** Set the delimiter to a TAB character **/
|
||||
$reader->setDelimiter("\t");
|
||||
// $reader->setDelimiter('|');
|
||||
|
@ -40,7 +40,7 @@ class Csv extends BaseReader implements IReader
|
||||
*
|
||||
* @var string
|
||||
*/
|
||||
private $delimiter = ',';
|
||||
private $delimiter = null;
|
||||
|
||||
/**
|
||||
* Enclosure.
|
||||
@ -152,6 +152,86 @@ class Csv extends BaseReader implements IReader
|
||||
return $this->skipBOM();
|
||||
}
|
||||
|
||||
/**
 * Infer the separator if it isn't explicitly set in the file or specified by the user.
 *
 * Does nothing when a delimiter is already set on the reader. Otherwise it
 * samples up to 1000 lines from the current position of the file handle,
 * counts how often each candidate delimiter occurs on every line, and picks
 * the candidate whose per-line count deviates least from its own median
 * (i.e. the one that appears most consistently). Candidates whose median
 * count is zero are ignored. When nothing can be inferred (including an
 * empty file) the first candidate, the comma, is used.
 *
 * NOTE(review): occurrences inside quoted (enclosed) values are counted as
 * well; the enclosure character is not taken into account here.
 */
protected function inferSeparator()
{
    if ($this->delimiter !== null) {
        return;
    }

    // Order matters: it is the tie-breaker, and the first entry is the fallback.
    $potentialDelimiters = [',', ';', "\t", '|', ':', ' '];
    $maxLines = 1000;

    $counts = [];
    foreach ($potentialDelimiters as $delimiter) {
        $counts[$delimiter] = [];
    }

    // Count how many times each of the potential delimiters appears in each line.
    // The cap is checked before reading so that $numberLines always equals the
    // number of lines actually tallied in every series.
    $numberLines = 0;
    while ($numberLines < $maxLines && ($line = fgets($this->fileHandle)) !== false) {
        ++$numberLines;
        foreach ($potentialDelimiters as $delimiter) {
            $counts[$delimiter][] = substr_count($line, $delimiter);
        }
    }

    // Nothing to analyse (empty file): fall back to the default right away
    // instead of computing a median over empty series.
    if ($numberLines === 0) {
        $this->delimiter = reset($potentialDelimiters);

        return $this->skipBOM();
    }

    // Calculate the mean square deviations for each delimiter (ignoring delimiters that haven't been found consistently)
    $meanSquareDeviations = [];
    $middleIdx = (int) floor(($numberLines - 1) / 2);

    foreach ($potentialDelimiters as $delimiter) {
        $series = $counts[$delimiter];
        sort($series);

        // Odd number of lines: the middle element; even: mean of the two middle elements.
        $median = ($numberLines % 2)
            ? $series[$middleIdx]
            : ($series[$middleIdx] + $series[$middleIdx + 1]) / 2;

        // A median of exactly 0 means the character is absent from at least half of the lines.
        if ($median === 0) {
            continue;
        }

        $meanSquareDeviations[$delimiter] = array_reduce(
            $series,
            function ($sum, $value) use ($median) {
                return $sum + pow($value - $median, 2);
            },
            0
        ) / count($series);
    }

    // ... and pick the delimiter with the smallest mean square deviation (in case of ties, the order in potentialDelimiters is respected)
    $min = INF;
    foreach ($potentialDelimiters as $delimiter) {
        if (!isset($meanSquareDeviations[$delimiter])) {
            continue;
        }

        // Strict "<" keeps the earlier candidate on equal deviations.
        if ($meanSquareDeviations[$delimiter] < $min) {
            $min = $meanSquareDeviations[$delimiter];
            $this->delimiter = $delimiter;
        }
    }

    // If no delimiter could be detected, fall back to the default
    if ($this->delimiter === null) {
        $this->delimiter = reset($potentialDelimiters);
    }

    // Sampling advanced the file pointer; skipBOM() is called as before,
    // presumably to reposition the handle at the start of the data (confirm
    // against its implementation).
    return $this->skipBOM();
}
|
||||
|
||||
/**
|
||||
* Return worksheet info (Name, Last Column Letter, Last Column Index, Total Rows, Total Columns).
|
||||
*
|
||||
@ -171,6 +251,7 @@ class Csv extends BaseReader implements IReader
|
||||
// Skip BOM, if any
|
||||
$this->skipBOM();
|
||||
$this->checkSeparator();
|
||||
$this->inferSeparator();
|
||||
|
||||
$worksheetInfo = [];
|
||||
$worksheetInfo[0]['worksheetName'] = 'Worksheet';
|
||||
@ -237,6 +318,7 @@ class Csv extends BaseReader implements IReader
|
||||
// Skip BOM, if any
|
||||
$this->skipBOM();
|
||||
$this->checkSeparator();
|
||||
$this->inferSeparator();
|
||||
|
||||
// Create new PhpSpreadsheet object
|
||||
while ($spreadsheet->getSheetCount() <= $this->sheetIndex) {
|
||||
|
@ -24,4 +24,18 @@ class CsvTest extends \PHPUnit_Framework_TestCase
|
||||
$actual = $reloadedSpreadsheet->getActiveSheet()->getCell('A1')->getCalculatedValue();
|
||||
$this->assertSame($value, $actual, 'should be able to write and read strings with multiples quotes');
|
||||
}
|
||||
|
||||
/**
 * Loading a semicolon-separated file without configuring a delimiter
 * must leave the reader with ';' as its delimiter and keep commas
 * inside cell values intact.
 */
public function testDelimiterDetection()
{
    $csvReader = new \PhpOffice\PhpSpreadsheet\Reader\Csv();

    // A fresh reader has no delimiter until one is set or inferred.
    $this->assertNull($csvReader->getDelimiter());

    $loaded = $csvReader->load(__DIR__ . '/../../data/Reader/CSV/semicolon_separated.csv');
    $this->assertSame(';', $csvReader->getDelimiter(), 'should be able to infer the delimiter');

    // C2 holds "25,5": the comma must be part of the value, not a separator.
    $cellC2 = $loaded->getActiveSheet()->getCell('C2');
    $this->assertSame('25,5', $cellC2->getValue(), 'should be able to retrieve values with commas');
}
|
||||
}
|
||||
|
3
tests/data/Reader/CSV/semicolon_separated.csv
Normal file
3
tests/data/Reader/CSV/semicolon_separated.csv
Normal file
@ -0,0 +1,3 @@
|
||||
This;Are;Headers
|
||||
Cell A2;Number with comma;25,5
|
||||
Two colons and a comma;B|3;:,:
|
|
Loading…
Reference in New Issue
Block a user