Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 118 additions & 4 deletions src/Migration/Sources/CSV.php
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ private function exportRows(int $batchSize): void
}
}

$this->withCSVStream(function ($stream) use ($columnTypes, $manyToManyKeys, $arrayKeys, $table, $batchSize) {
$this->withCSVStream(function ($stream, $delimiter) use ($columnTypes, $manyToManyKeys, $arrayKeys, $table, $batchSize) {
$headers = fgetcsv($stream);
if (! is_array($headers) || count($headers) === 0) {
return;
Expand All @@ -190,7 +190,7 @@ private function exportRows(int $batchSize): void

$buffer = [];

while (($csvRowItem = fgetcsv($stream)) !== false) {
while (($csvRowItem = fgetcsv(stream: $stream, separator: $delimiter)) !== false) {
if (count($csvRowItem) !== count($headers)) {
throw new \Exception('CSV row does not match the number of header columns.');
}
Expand Down Expand Up @@ -314,7 +314,7 @@ protected function exportGroupFunctions(int $batchSize, array $resources): void
}

/**
* @param callable(resource $stream): void $callback
* @param callable(resource $stream, string $delimiter): void $callback
* @return void
* @throws \Exception
*/
Expand All @@ -336,8 +336,10 @@ private function withCsvStream(callable $callback): void
return;
}

$delimiter = $this->delimiter($stream);

try {
$callback($stream);
$callback($stream, $delimiter);
} finally {
\fclose($stream);
}
Expand Down Expand Up @@ -402,4 +404,116 @@ private function downloadToLocal(

$this->downloaded = true;
}

/**
 * Detect the field delimiter of a CSV stream by sampling its first lines.
 *
 * Up to five non-blank lines are read, the stream is rewound, and every
 * candidate delimiter is scored as `consistency * quality`:
 *  - consistency: how stable the per-line column count is (1.0 when every
 *    sampled line splits into the same number of fields);
 *  - quality: the share of fields that look like real values (longer than
 *    one character, or a single alphanumeric character).
 *
 * @param resource $stream Readable stream; it is rewound to its start before returning.
 * @return string The best-scoring delimiter, or ',' when no candidate scores above zero.
 */
private function delimiter($stream): string
{
    /**
     * widely used options, from here -
     *
     * https://stackoverflow.com/a/15946087/6819340
     */
    $delimiters = [',', ';', "\t", '|'];
    $maxSampleLines = 5;

    $sampleLines = [];

    while (count($sampleLines) < $maxSampleLines && ($line = fgets($stream)) !== false) {
        // Strip only the line ending: a plain trim() would also eat leading or
        // trailing tabs, i.e. real (empty) columns of a tab-delimited row.
        $line = rtrim($line, "\r\n");

        // Blank lines carry no signal. Compare against '' explicitly because
        // empty() would also discard a line consisting of the single value "0".
        if (trim($line) === '') {
            continue;
        }

        $sampleLines[] = $line;
    }

    /**
     * reset to top again because we need to process
     * the same file later again if everything goes OK here!
     */
    rewind($stream);

    if ($sampleLines === []) {
        return ',';
    }

    $sampleCount = count($sampleLines);

    // Comma is the fallback; a candidate must score strictly above zero to replace it.
    $bestDelimiter = ',';
    $bestScore = 0.0;

    foreach ($delimiters as $delimiter) {
        $columnCounts = [];
        $totalFields = 0;
        $usableFields = 0;

        foreach ($sampleLines as $line) {
            // A line without the delimiter is one field; skip the CSV parser for it.
            $fields = str_contains($line, $delimiter)
                ? str_getcsv($line, $delimiter)
                : [$line];

            $fieldCount = count($fields);
            $columnCounts[] = $fieldCount;
            $totalFields += $fieldCount;

            // Count fields that make some sense, i.e. longer than one character
            // or a single alphanumeric one (ids, flags, one-letter codes).
            foreach ($fields as $field) {
                $trimmed = trim((string) $field);
                if (strlen($trimmed) > 1 || preg_match('/\A[A-Za-z0-9]\z/', $trimmed) === 1) {
                    $usableFields++;
                }
            }
        }

        $avgColumns = $totalFields / $sampleCount;

        // The delimiter does not split anything: it cannot be the right one.
        if ($avgColumns <= 1) {
            continue;
        }

        if ($sampleCount <= 1) {
            // A single line cannot be inconsistent with itself.
            $consistencyScore = 1.0;
        } else {
            $squaredDeviation = 0.0;
            foreach ($columnCounts as $count) {
                $squaredDeviation += ($count - $avgColumns) ** 2;
            }

            $stddev = sqrt($squaredDeviation / $sampleCount);
            $coefficientOfVariation = $stddev / $avgColumns;

            // lower variance = higher score
            $consistencyScore = 1.0 / (1.0 + $coefficientOfVariation * 2);
        }

        // $totalFields is > 0 here because $avgColumns > 1.
        $score = $consistencyScore * ($usableFields / $totalFields);

        // Strict comparison keeps the earlier candidate on ties, so the order
        // of $delimiters doubles as the tie-break priority.
        if ($score > $bestScore) {
            $bestScore = $score;
            $bestDelimiter = $delimiter;
        }
    }

    return $bestDelimiter;
}
}
49 changes: 49 additions & 0 deletions tests/Migration/Unit/General/CSVTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<?php

namespace Migration\Unit\General;

use PHPUnit\Framework\TestCase;
use Utopia\Migration\Sources\CSV;

/**
 * Unit tests for the delimiter auto-detection of the CSV migration source.
 */
class CSVTest extends TestCase
{
    private const RESOURCES_DIR = __DIR__ . '/../../resources/csv/';

    /**
     * Call the private CSV::delimiter() method on the given stream.
     *
     * The CSV source is instantiated without its constructor; the test
     * relies on delimiter() working from the stream argument alone.
     *
     * @param resource $stream Open, readable stream positioned at the start of the CSV data.
     * @return string The delimiter reported by CSV::delimiter().
     * @throws \ReflectionException
     */
    private function detectDelimiter($stream): string
    {
        $reflection = new \ReflectionClass(CSV::class);
        $instance = $reflection->newInstanceWithoutConstructor();

        $refMethod = $reflection->getMethod('delimiter');

        /** @noinspection PhpExpressionResultUnusedInspection */
        $refMethod->setAccessible(true);

        return $refMethod->invoke($instance, $stream);
    }

    /**
     * Every fixture must resolve to its expected delimiter, and detection
     * must leave the stream rewound so the file can be parsed afterwards.
     *
     * @throws \ReflectionException
     */
    public function testDetectDelimiter(): void
    {
        $cases = [
            ['file' => 'comma.csv', 'expected' => ','],
            ['file' => 'single_column.csv', 'expected' => ','], // fallback
            ['file' => 'empty.csv', 'expected' => ','], // fallback
            ['file' => 'quoted_fields.csv', 'expected' => ','],
            ['file' => 'semicolon.csv', 'expected' => ';'],
            ['file' => 'tab.csv', 'expected' => "\t"],
            ['file' => 'pipe.csv', 'expected' => '|'],
        ];

        foreach ($cases as $case) {
            $filepath = self::RESOURCES_DIR . $case['file'];

            // Fail with a readable message instead of an fopen() warning or a
            // type error further down when a fixture is missing.
            $this->assertFileExists($filepath, "Missing fixture {$case['file']}");

            $stream = fopen($filepath, 'r');
            $this->assertNotFalse($stream, "Unable to open fixture {$case['file']}");

            try {
                $delimiter = $this->detectDelimiter($stream);
                $position = ftell($stream);
            } finally {
                // Release the handle even when detection throws.
                fclose($stream);
            }

            $this->assertSame($case['expected'], $delimiter, "Failed for {$case['file']}");
            $this->assertSame(0, $position, "Stream not rewound for {$case['file']}");
        }
    }
}
11 changes: 11 additions & 0 deletions tests/Migration/resources/csv/comma.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
id,name,age,city,country,email,phone,department,role,joined
1,Alice,23,New York,USA,alice@example.com,555-0101,Engineering,Developer,2020-01-15
2,Bob,30,London,UK,bob@example.com,555-0102,Data,Analyst,2019-05-20
3,Charlie,25,Tokyo,Japan,charlie@example.com,555-0103,Product,Manager,2021-08-10
4,Diana,28,Sydney,Australia,diana@example.com,555-0104,Engineering,Designer,2020-04-12
5,Ethan,31,Toronto,Canada,ethan@example.com,555-0105,Marketing,Lead,2018-11-23
6,Faith,27,Berlin,Germany,faith@example.com,555-0106,Finance,Accountant,2017-07-14
7,George,35,Paris,France,george@example.com,555-0107,Legal,Advisor,2021-02-05
8,Hannah,26,Singapore,Singapore,hannah@example.com,555-0108,Operations,Coordinator,2019-09-27
9,Ian,29,Dubai,UAE,ian@example.com,555-0109,Engineering,QA,2022-03-19
10,Julia,24,Zurich,Switzerland,julia@example.com,555-0110,Research,Scientist,2020-12-01
Empty file.
11 changes: 11 additions & 0 deletions tests/Migration/resources/csv/pipe.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
id|name|age|city|country|email|phone|department|role|joined
1|Alice|23|New York|USA|alice@example.com|555-0101|Engineering|Developer|2020-01-15
2|Bob|30|London|UK|bob@example.com|555-0102|Data|Analyst|2019-05-20
3|Charlie|25|Tokyo|Japan|charlie@example.com|555-0103|Product|Manager|2021-08-10
4|Diana|28|Sydney|Australia|diana@example.com|555-0104|Engineering|Designer|2020-04-12
5|Ethan|31|Toronto|Canada|ethan@example.com|555-0105|Marketing|Lead|2018-11-23
6|Faith|27|Berlin|Germany|faith@example.com|555-0106|Finance|Accountant|2017-07-14
7|George|35|Paris|France|george@example.com|555-0107|Legal|Advisor|2021-02-05
8|Hannah|26|Singapore|Singapore|hannah@example.com|555-0108|Operations|Coordinator|2019-09-27
9|Ian|29|Dubai|UAE|ian@example.com|555-0109|Engineering|QA|2022-03-19
10|Julia|24|Zurich|Switzerland|julia@example.com|555-0110|Research|Scientist|2020-12-01
11 changes: 11 additions & 0 deletions tests/Migration/resources/csv/quoted_fields.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name,description,tags
"Alice","Software Engineer, Team Lead","php,js"
"Bob","Data Analyst; Expert","sql,excel"
"Carol","CTO, Strategic Leader","go,devops,cloud"
"David","Back-end Developer, API specialist","python,flask,django"
"Emma","Fullstack Developer, React/Vue","js,react,vue"
"Frank","DevOps; Site Reliability","docker,k8s,terraform"
"Grace","Product Manager, B2B; B2C","planning,roadmap"
"Hannah","Support Lead, ""Customer Success""","support,crm"
"Ian","QA Engineer, Manual & Automated testing","selenium,pytest"
"Jane","UI/UX Designer, accessibility","figma,sketch"
11 changes: 11 additions & 0 deletions tests/Migration/resources/csv/semicolon.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
id;name;age;city;country;email;phone;department;role;joined
1;Alice;23;New York;USA;alice@example.com;555-0101;Engineering;Developer;2020-01-15
2;Bob;30;London;UK;bob@example.com;555-0102;Data;Analyst;2019-05-20
3;Charlie;25;Tokyo;Japan;charlie@example.com;555-0103;Product;Manager;2021-08-10
4;Diana;28;Sydney;Australia;diana@example.com;555-0104;Engineering;Designer;2020-04-12
5;Ethan;31;Toronto;Canada;ethan@example.com;555-0105;Marketing;Lead;2018-11-23
6;Faith;27;Berlin;Germany;faith@example.com;555-0106;Finance;Accountant;2017-07-14
7;George;35;Paris;France;george@example.com;555-0107;Legal;Advisor;2021-02-05
8;Hannah;26;Singapore;Singapore;hannah@example.com;555-0108;Operations;Coordinator;2019-09-27
9;Ian;29;Dubai;UAE;ian@example.com;555-0109;Engineering;QA;2022-03-19
10;Julia;24;Zurich;Switzerland;julia@example.com;555-0110;Research;Scientist;2020-12-01
12 changes: 12 additions & 0 deletions tests/Migration/resources/csv/single_column.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
singlecolumn
alpha
beta
gamma
delta
epsilon
zeta
eta
theta
iota
kappa
lambda
11 changes: 11 additions & 0 deletions tests/Migration/resources/csv/tab.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
id name age city country email phone department role joined
1 Alice 23 New York USA alice@example.com 555-0101 Engineering Developer 2020-01-15
2 Bob 30 London UK bob@example.com 555-0102 Data Analyst 2019-05-20
3 Charlie 25 Tokyo Japan charlie@example.com 555-0103 Product Manager 2021-08-10
4 Diana 28 Sydney Australia diana@example.com 555-0104 Engineering Designer 2020-04-12
5 Ethan 31 Toronto Canada ethan@example.com 555-0105 Marketing Lead 2018-11-23
6 Faith 27 Berlin Germany faith@example.com 555-0106 Finance Accountant 2017-07-14
7 George 35 Paris France george@example.com 555-0107 Legal Advisor 2021-02-05
8 Hannah 26 Singapore Singapore hannah@example.com 555-0108 Operations Coordinator 2019-09-27
9 Ian 29 Dubai UAE ian@example.com 555-0109 Engineering QA 2022-03-19
10 Julia 24 Zurich Switzerland julia@example.com 555-0110 Research Scientist 2020-12-01