/**
 * Guess the field delimiter of a CSV stream by sampling its first lines.
 *
 * Up to five non-blank lines are read from the stream's current position and
 * the stream is then rewound so the caller can parse it from the start.
 * Every candidate (comma, semicolon, tab, pipe) is scored as
 * consistency * quality:
 *  - consistency rewards the same field count on every sampled line;
 *  - quality is the share of fields that look like real values, i.e. are
 *    longer than one character or are a single alphanumeric character.
 * The highest positive score wins, ties go to the candidate listed first,
 * and a comma is returned when no candidate scores above zero.
 *
 * @param resource $stream Readable stream; rewind() is called on it, so it is assumed to be seekable.
 * @return string The detected delimiter, or ',' as the fallback.
 */
private function delimiter($stream): string
{
    $fallback = ',';

    // Widely used options, see https://stackoverflow.com/a/15946087/6819340
    $candidates = [',', ';', "\t", '|'];
    $maxSamples = 5;

    $sampleLines = [];
    while (count($sampleLines) < $maxSamples && ($line = fgets($stream)) !== false) {
        $line = trim($line);

        // Strict comparison on purpose: empty() would also discard a line that holds only "0".
        if ($line !== '') {
            $sampleLines[] = $line;
        }
    }

    // Reset to the top: the caller processes the same stream again after detection.
    rewind($stream);

    if ($sampleLines === []) {
        return $fallback;
    }

    $sampleCount = count($sampleLines);
    $bestDelimiter = $fallback;
    $bestScore = 0.0;

    foreach ($candidates as $candidate) {
        $columnCounts = [];
        $usableFields = 0;

        foreach ($sampleLines as $line) {
            // A line without the candidate is a single field; no need to parse it.
            $fields = str_contains($line, $candidate)
                ? str_getcsv($line, $candidate)
                : [$line];

            $columnCounts[] = count($fields);

            foreach ($fields as $field) {
                $value = trim((string) $field);

                // Single alphanumeric characters (ids, flags, one-letter headers) are real values
                // too; without this, a file made of such fields scores zero for every candidate.
                if (strlen($value) > 1 || ctype_alnum($value)) {
                    $usableFields++;
                }
            }
        }

        $totalFields = array_sum($columnCounts);
        $avgColumns = $totalFields / $sampleCount;

        // A candidate that never splits any line cannot be the delimiter.
        if ($avgColumns <= 1) {
            continue;
        }

        // Coefficient of variation of the per-line field counts; it is 0 when every
        // line has the same number of fields (including the single-sample case).
        $squaredDeviation = 0.0;
        foreach ($columnCounts as $count) {
            $squaredDeviation += ($count - $avgColumns) ** 2;
        }
        $coefficientOfVariation = sqrt($squaredDeviation / $sampleCount) / $avgColumns;

        // Lower variation => higher score, bounded to (0, 1].
        $consistencyScore = 1.0 / (1.0 + $coefficientOfVariation * 2);
        $qualityScore = $usableFields / $totalFields;
        $score = $consistencyScore * $qualityScore;

        // Strict ">" keeps the earliest candidate on ties and ignores zero scores.
        if ($score > $bestScore) {
            $bestScore = $score;
            $bestDelimiter = $candidate;
        }
    }

    return $bestDelimiter;
}
+7,George,35,Paris,France,george@example.com,555-0107,Legal,Advisor,2021-02-05 +8,Hannah,26,Singapore,Singapore,hannah@example.com,555-0108,Operations,Coordinator,2019-09-27 +9,Ian,29,Dubai,UAE,ian@example.com,555-0109,Engineering,QA,2022-03-19 +10,Julia,24,Zurich,Switzerland,julia@example.com,555-0110,Research,Scientist,2020-12-01 diff --git a/tests/Migration/resources/csv/empty.csv b/tests/Migration/resources/csv/empty.csv new file mode 100644 index 00000000..e69de29b diff --git a/tests/Migration/resources/csv/pipe.csv b/tests/Migration/resources/csv/pipe.csv new file mode 100644 index 00000000..bd653468 --- /dev/null +++ b/tests/Migration/resources/csv/pipe.csv @@ -0,0 +1,11 @@ +id|name|age|city|country|email|phone|department|role|joined +1|Alice|23|New York|USA|alice@example.com|555-0101|Engineering|Developer|2020-01-15 +2|Bob|30|London|UK|bob@example.com|555-0102|Data|Analyst|2019-05-20 +3|Charlie|25|Tokyo|Japan|charlie@example.com|555-0103|Product|Manager|2021-08-10 +4|Diana|28|Sydney|Australia|diana@example.com|555-0104|Engineering|Designer|2020-04-12 +5|Ethan|31|Toronto|Canada|ethan@example.com|555-0105|Marketing|Lead|2018-11-23 +6|Faith|27|Berlin|Germany|faith@example.com|555-0106|Finance|Accountant|2017-07-14 +7|George|35|Paris|France|george@example.com|555-0107|Legal|Advisor|2021-02-05 +8|Hannah|26|Singapore|Singapore|hannah@example.com|555-0108|Operations|Coordinator|2019-09-27 +9|Ian|29|Dubai|UAE|ian@example.com|555-0109|Engineering|QA|2022-03-19 +10|Julia|24|Zurich|Switzerland|julia@example.com|555-0110|Research|Scientist|2020-12-01 diff --git a/tests/Migration/resources/csv/quoted_fields.csv b/tests/Migration/resources/csv/quoted_fields.csv new file mode 100644 index 00000000..e15fb076 --- /dev/null +++ b/tests/Migration/resources/csv/quoted_fields.csv @@ -0,0 +1,11 @@ +name,description,tags +"Alice","Software Engineer, Team Lead","php,js" +"Bob","Data Analyst; Expert","sql,excel" +"Carol","CTO, Strategic Leader","go,devops,cloud" +"David","Back-end 
Developer, API specialist","python,flask,django" +"Emma","Fullstack Developer, React/Vue","js,react,vue" +"Frank","DevOps; Site Reliability","docker,k8s,terraform" +"Grace","Product Manager, B2B; B2C","planning,roadmap" +"Hannah","Support Lead, ""Customer Success""","support,crm" +"Ian","QA Engineer, Manual & Automated testing","selenium,pytest" +"Jane","UI/UX Designer, accessibility","figma,sketch" \ No newline at end of file diff --git a/tests/Migration/resources/csv/semicolon.csv b/tests/Migration/resources/csv/semicolon.csv new file mode 100644 index 00000000..b8c2c5fb --- /dev/null +++ b/tests/Migration/resources/csv/semicolon.csv @@ -0,0 +1,11 @@ +id;name;age;city;country;email;phone;department;role;joined +1;Alice;23;New York;USA;alice@example.com;555-0101;Engineering;Developer;2020-01-15 +2;Bob;30;London;UK;bob@example.com;555-0102;Data;Analyst;2019-05-20 +3;Charlie;25;Tokyo;Japan;charlie@example.com;555-0103;Product;Manager;2021-08-10 +4;Diana;28;Sydney;Australia;diana@example.com;555-0104;Engineering;Designer;2020-04-12 +5;Ethan;31;Toronto;Canada;ethan@example.com;555-0105;Marketing;Lead;2018-11-23 +6;Faith;27;Berlin;Germany;faith@example.com;555-0106;Finance;Accountant;2017-07-14 +7;George;35;Paris;France;george@example.com;555-0107;Legal;Advisor;2021-02-05 +8;Hannah;26;Singapore;Singapore;hannah@example.com;555-0108;Operations;Coordinator;2019-09-27 +9;Ian;29;Dubai;UAE;ian@example.com;555-0109;Engineering;QA;2022-03-19 +10;Julia;24;Zurich;Switzerland;julia@example.com;555-0110;Research;Scientist;2020-12-01 diff --git a/tests/Migration/resources/csv/single_column.csv b/tests/Migration/resources/csv/single_column.csv new file mode 100644 index 00000000..b39785e8 --- /dev/null +++ b/tests/Migration/resources/csv/single_column.csv @@ -0,0 +1,12 @@ +singlecolumn +alpha +beta +gamma +delta +epsilon +zeta +eta +theta +iota +kappa +lambda diff --git a/tests/Migration/resources/csv/tab.csv b/tests/Migration/resources/csv/tab.csv new file mode 100644 index 
00000000..59f72aa7 --- /dev/null +++ b/tests/Migration/resources/csv/tab.csv @@ -0,0 +1,11 @@ +id name age city country email phone department role joined +1 Alice 23 New York USA alice@example.com 555-0101 Engineering Developer 2020-01-15 +2 Bob 30 London UK bob@example.com 555-0102 Data Analyst 2019-05-20 +3 Charlie 25 Tokyo Japan charlie@example.com 555-0103 Product Manager 2021-08-10 +4 Diana 28 Sydney Australia diana@example.com 555-0104 Engineering Designer 2020-04-12 +5 Ethan 31 Toronto Canada ethan@example.com 555-0105 Marketing Lead 2018-11-23 +6 Faith 27 Berlin Germany faith@example.com 555-0106 Finance Accountant 2017-07-14 +7 George 35 Paris France george@example.com 555-0107 Legal Advisor 2021-02-05 +8 Hannah 26 Singapore Singapore hannah@example.com 555-0108 Operations Coordinator 2019-09-27 +9 Ian 29 Dubai UAE ian@example.com 555-0109 Engineering QA 2022-03-19 +10 Julia 24 Zurich Switzerland julia@example.com 555-0110 Research Scientist 2020-12-01