Skip to content

Commit 8d7b5f8

Browse files
authored
[Blueprints] Support multiline SQL queries in the runSql step (#2928)
Adapts [WP_MySQL_Naive_Query_Stream](WordPress/sqlite-database-integration#264) to support multiline SQL queries in the `runSql` step. With this PR, the following call works:

```ts
await runSql(php, {
	sql: new File(
		[
			`SELECT * FROM wp_users -- users table
; SELECT * FROM wp_posts;`,
		],
		'no-trailing-newline.sql'
	),
});
```

Before this PR, the `runSql` step assumed that every line of a SQL file was a separate query, so it would fail on the above call.

## Implementation details

See WordPress/sqlite-database-integration#264. TL;DR: we tokenize the query and treat `;` and EOF tokens as query separators. The stream is only "naive" in that every query must be smaller than 15MB. It might fail for some very large WordPress posts, but should work most of the time. Once the lexer provides an explicit distinction between syntax errors and incomplete input, we'll be able to support arbitrarily large queries.

## Testing Instructions (or ideally a Blueprint)

Tests have been updated to verify multiline query handling, SQL comment preservation, and queries with subqueries. The streaming parser correctly handles edge cases like empty lines, semicolon-only lines, and queries split across chunk boundaries.

cc @JanJakes
1 parent 5ce5752 commit 8d7b5f8

File tree

7 files changed

+344
-44
lines changed

7 files changed

+344
-44
lines changed

packages/php-wasm/universal/src/lib/fs-helpers.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ export class FSHelpers {
5959
static writeFile(
6060
FS: Emscripten.RootFS,
6161
path: string,
62-
data: string | Uint8Array
62+
data: string | Uint8Array | Buffer
6363
) {
6464
FS.writeFile(path, data);
6565
}

packages/php-wasm/universal/src/lib/php.ts

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -285,7 +285,7 @@ export class PHP implements Disposable {
285285
// Always enable the file cache.
286286
'opcache.file_cache_only = 1',
287287
'opcache.file_cache_consistency_checks = 1',
288-
]
288+
]
289289
: [];
290290

291291
/*if (
@@ -508,9 +508,8 @@ export class PHP implements Disposable {
508508
*/
509509
async run(request: PHPRunOptions): Promise<PHPResponse> {
510510
const streamedResponse = await this.runStream(request);
511-
const syncResponse = await PHPResponse.fromStreamedResponse(
512-
streamedResponse
513-
);
511+
const syncResponse =
512+
await PHPResponse.fromStreamedResponse(streamedResponse);
514513

515514
if (syncResponse.exitCode !== 0) {
516515
// Legacy run() behavior: throw if PHP exited with a non-zero exit code.
@@ -1189,7 +1188,7 @@ export class PHP implements Disposable {
11891188
* @param path - The file path to write to.
11901189
* @param data - The data to write to the file.
11911190
*/
1192-
writeFile(path: string, data: string | Uint8Array) {
1191+
writeFile(path: string, data: string | Uint8Array | Buffer) {
11931192
const result = FSHelpers.writeFile(
11941193
this[__private__dont__use].FS,
11951194
path,
@@ -1754,9 +1753,9 @@ const getNodeType = (fs: Emscripten.FileSystemInstance, path: string) => {
17541753
return 'contents' in target.node
17551754
? 'memfs'
17561755
: /**
1757-
* Could be NODEFS, PROXYFS, etc.
1758-
*/
1759-
'not-memfs';
1756+
* Could be NODEFS, PROXYFS, etc.
1757+
*/
1758+
'not-memfs';
17601759
} catch {
17611760
return 'missing';
17621761
}
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
<?php
2+
3+
/**
 * Naively splits an SQL string into a sequence of queries. It
 * streams the data so you can process very large chunks of SQL
 * without running out of memory.
 *
 * This class is **naive** because it doesn't understand what a
 * valid query is. The lexer does not provide a way to distinguish
 * between a syntax error and an incomplete input yet. Lacking this
 * information, we assume that no SQL query is larger than
 * MAX_SQL_BUFFER_SIZE (15MB) and, failing to extract a query from a
 * buffer larger than that, we fail. This heuristic is often sufficient,
 * but may fail in pathological cases.
 *
 * Usage:
 *
 *     $stream = new WP_MySQL_Naive_Query_Stream();
 *     $stream->append_sql( 'SELECT id FROM users; SELECT * FROM posts;' );
 *     while ( $stream->next_query() ) {
 *         $sql_string = $stream->get_query();
 *         // Process the query.
 *     }
 *     $stream->append_sql( 'CREATE TABLE users (id INT, name VARCHAR(255));' );
 *     while ( $stream->next_query() ) {
 *         $sql_string = $stream->get_query();
 *         // Process the query.
 *     }
 *     $stream->mark_input_complete();
 *     $stream->next_query(); // returns false
 */
class WP_MySQL_Naive_Query_Stream {

	/** @var string SQL that has been appended but not yet returned as a query. */
	private $sql_buffer = '';

	/** @var bool Whether the caller has declared that no more SQL will be appended. */
	private $input_complete = false;

	/**
	 * @var string One of the STATE_* constants. Starts as STATE_QUERY, the
	 *             same value append_sql() sets, so get_state() never exposes
	 *             a value outside the STATE_* set.
	 */
	private $state = self::STATE_QUERY;

	/** @var string|false The query found by the last next_query() call, or false. */
	private $last_query = false;

	const STATE_QUERY                      = 'valid';
	const STATE_SYNTAX_ERROR               = 'syntax_error';
	const STATE_PAUSED_ON_INCOMPLETE_INPUT = 'paused_on_incomplete_input';
	const STATE_FINISHED                   = 'finished';

	/**
	 * The maximum size of the buffer to store the SQL input. We don't
	 * have enough information from the lexer to distinguish between
	 * an incomplete input and a syntax error so we use a heuristic –
	 * if we've accumulated more than this amount of SQL input, we assume
	 * it's a syntax error. That's why this class is called a "naive" query
	 * stream.
	 */
	const MAX_SQL_BUFFER_SIZE = 1024 * 1024 * 15;

	public function __construct() {}

	/**
	 * Appends more SQL to the internal buffer.
	 *
	 * @param string $sql The next chunk of SQL input.
	 * @return bool False when the input was already marked complete (the
	 *              chunk is ignored), true otherwise.
	 */
	public function append_sql( string $sql ) {
		if ( $this->input_complete ) {
			return false;
		}
		$this->sql_buffer .= $sql;
		// New data may complete a query, so leave any paused/error state.
		$this->state = self::STATE_QUERY;
		return true;
	}

	/**
	 * Tells whether the stream is waiting for more input.
	 *
	 * @return bool True when the current state is STATE_PAUSED_ON_INCOMPLETE_INPUT.
	 */
	public function is_paused_on_incomplete_input(): bool {
		return self::STATE_PAUSED_ON_INCOMPLETE_INPUT === $this->state;
	}

	/**
	 * Declares that no more SQL will be appended.
	 *
	 * From now on, a trailing query that does not end with a semicolon is
	 * treated as complete.
	 */
	public function mark_input_complete() {
		$this->input_complete = true;

		// If the stream paused while waiting for more input, that wait is
		// now over. Leave the paused state so the next next_query() call
		// re-examines the buffer; otherwise next_query() would keep
		// returning false and a final query without a trailing semicolon
		// could never be emitted (append_sql() is refused from here on).
		if ( self::STATE_PAUSED_ON_INCOMPLETE_INPUT === $this->state ) {
			$this->state = self::STATE_QUERY;
		}
	}

	/**
	 * Advances to the next query in the buffer.
	 *
	 * @return bool True when a query was found (read it with get_query()),
	 *              false otherwise. On false, get_state() tells whether the
	 *              stream is paused, finished, or assumed a syntax error.
	 */
	public function next_query() {
		$this->last_query = false;
		if ( self::STATE_PAUSED_ON_INCOMPLETE_INPUT === $this->state ) {
			// Nothing changed since the last attempt; don't re-lex the buffer.
			return false;
		}

		$result = $this->do_next_query();
		if ( ! $result && strlen( $this->sql_buffer ) > self::MAX_SQL_BUFFER_SIZE ) {
			// No query could be extracted from an oversized buffer. Per the
			// heuristic described on MAX_SQL_BUFFER_SIZE, call it a syntax error.
			$this->state = self::STATE_SYNTAX_ERROR;
			return false;
		}
		return $result;
	}

	/**
	 * Tokenizes the buffer up to the first semicolon (or the end of the
	 * tokens) and, if that span forms a query, moves it from the buffer
	 * into $last_query.
	 *
	 * Only the most recent token and a single flag are kept while scanning,
	 * so memory use does not grow with the number of tokens in the query.
	 *
	 * @return bool True when a query was extracted, false otherwise.
	 */
	private function do_next_query() {
		$lexer                 = new WP_MySQL_Lexer( $this->sql_buffer );
		$last_token            = null;
		$has_meaningful_tokens = false;

		while ( $lexer->next_token() ) {
			$last_token = $lexer->get_token();
			if ( ! $has_meaningful_tokens && $this->is_meaningful_token( $last_token ) ) {
				$has_meaningful_tokens = true;
			}
			if ( WP_MySQL_Lexer::SEMICOLON_SYMBOL === $last_token->id ) {
				// Got a complete query!
				break;
			}
		}

		// @TODO: expose this method from the lexer
		// if($lexer->get_state() === WP_MySQL_Lexer::STATE_SYNTAX_ERROR) {
		// return false;
		// }

		// The lexer produced no tokens at all.
		if ( null === $last_token ) {
			return $this->pause_or_finish();
		}

		// The last token either needs to be a semicolon, or be the
		// last token in the input.
		if (
			WP_MySQL_Lexer::SEMICOLON_SYMBOL !== $last_token->id &&
			! $this->input_complete
		) {
			$this->state = self::STATE_PAUSED_ON_INCOMPLETE_INPUT;
			return false;
		}

		// We don't want to hand the caller a comment (or whitespace)
		// disguised as a query. Note the buffer is left untouched here.
		if ( ! $has_meaningful_tokens ) {
			return $this->pause_or_finish();
		}

		// Remove the query from the input buffer and return it.
		$last_byte        = $last_token->start + $last_token->length;
		$this->last_query = substr( $this->sql_buffer, 0, $last_byte );
		$this->sql_buffer = substr( $this->sql_buffer, $last_byte );
		$this->state      = self::STATE_QUERY;
		return true;
	}

	/**
	 * Tells whether a token is anything other than whitespace, a comment,
	 * a MySQL comment delimiter, or EOF. A semicolon is not in that list,
	 * so it counts as meaningful.
	 *
	 * @param object $token A token returned by WP_MySQL_Lexer::get_token().
	 * @return bool
	 */
	private function is_meaningful_token( $token ) {
		return (
			WP_MySQL_Lexer::WHITESPACE !== $token->id &&
			WP_MySQL_Lexer::COMMENT !== $token->id &&
			WP_MySQL_Lexer::MYSQL_COMMENT_START !== $token->id &&
			WP_MySQL_Lexer::MYSQL_COMMENT_END !== $token->id &&
			WP_MySQL_Lexer::EOF !== $token->id
		);
	}

	/**
	 * Records that no query is available right now: the stream is finished
	 * when the input is complete, and paused waiting for more input otherwise.
	 *
	 * @return bool Always false, so callers can `return $this->pause_or_finish();`.
	 */
	private function pause_or_finish() {
		$this->state = $this->input_complete
			? self::STATE_FINISHED
			: self::STATE_PAUSED_ON_INCOMPLETE_INPUT;
		return false;
	}

	/**
	 * Returns the query found by the most recent next_query() call.
	 *
	 * @return string|false The query text as it appeared in the input, or
	 *                      false when the last next_query() call found none.
	 */
	public function get_query() {
		return $this->last_query;
	}

	/**
	 * Returns the current state of the stream.
	 *
	 * @return string One of the STATE_* constants.
	 */
	public function get_state() {
		return $this->state;
	}

}

0 commit comments

Comments
 (0)