Skip to content

Commit 5096d5d

Browse files
committed
Care null character
1 parent 7b02e9c commit 5096d5d

File tree

2 files changed

+122
-76
lines changed

2 files changed

+122
-76
lines changed

src/lua-urlencode.c

Lines changed: 121 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
#include "lua-urlencode.h"
22

/*
 * UTF-8 byte classification helpers.
 *
 * UTF8_LEAD(c)  is non-zero when c is an ASCII byte (< 0x80) or a
 *               multi-byte lead byte in the range 0xC2..0xF4.
 * UTF8_TRAIL(c) is non-zero when c has the continuation-byte bit
 *               pattern 10xxxxxx.
 *
 * UTF8_LEAD expands its argument several times, so do not pass an
 * expression with side effects.
 */
#define UTF8_LEAD(c) ((uint8_t)(c) < 0x80 || ((uint8_t)(c) > 0xC1 && (uint8_t)(c) < 0xF5))
#define UTF8_TRAIL(c) (((uint8_t)(c) & 0xC0) == 0x80)

/* Forward declaration; the definition appears further down in this file. */
uint8_t utf8_len(const uint8_t* str);

/*
 * Upper-case hexadecimal digits, indexed by nibble value (0..15).
 * The array holds exactly 16 bytes, so it carries no NUL terminator and
 * must not be used as a C string.
 */
static const uint8_t xdigit[16] = "0123456789ABCDEF";
49
static const int url_unreserved[256] = {
510
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x0F */
611
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x10-0x1F */
@@ -20,42 +25,27 @@ static const int url_unreserved[256] = {
2025
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
2126
};
2227

23-
typedef struct {
24-
char* got;
25-
char* err;
26-
} urlencode_result_t;
27-
28-
static urlencode_result_t _encode_url(const unsigned char* input, const char* encoding) {
29-
if (setlocale(LC_CTYPE, encoding) == NULL) {
30-
const char* errmsg = "failed to configure the locale";
31-
printf("%s\n", errmsg);
32-
const urlencode_result_t result = {"", errmsg};
33-
return result;
28+
static uint8_t* _encode_url(const uint8_t* input) {
29+
if (input[0] == '\0') {
30+
return "";
3431
}
3532

3633
const long len = strlen(input);
37-
const char* endmarker = input + len;
38-
char* encoded;
39-
encoded = (char*)malloc(sizeof(char) * len * 3 + 1);
34+
const uint8_t* endmarker = input + len;
35+
uint8_t* encoded;
36+
encoded = (uint8_t*)malloc(sizeof(uint8_t) * len * 3 + 1);
4037

4138
int in_cursor = 0;
4239
int out_cursor = 0;
43-
int char_count = 0;
4440
while (input[in_cursor] != '\0') {
45-
int charlen = mblen(&input[in_cursor], MB_CUR_MAX);
46-
if (charlen < 0) {
47-
const char* errmsg = "invalid character has come";
48-
printf("%s\n", errmsg);
49-
const urlencode_result_t result = {"", errmsg};
50-
return result;
51-
}
41+
const uint8_t charlen = utf8_len(&input[in_cursor]);
5242

5343
if (charlen == 0) {
5444
continue;
5545
}
5646

5747
if (charlen <= 1) {
58-
const unsigned char c = input[in_cursor];
48+
const uint8_t c = input[in_cursor];
5949
in_cursor += charlen;
6050
if (url_unreserved[c]) {
6151
encoded[out_cursor++] = c;
@@ -70,33 +60,22 @@ static urlencode_result_t _encode_url(const unsigned char* input, const char* en
7060
}
7161

7262
for (int i = 0; i < charlen; i++, in_cursor++) {
73-
const unsigned char c = input[in_cursor];
63+
const uint8_t c = input[in_cursor];
7464
encoded[out_cursor++] = '%';
7565
encoded[out_cursor++] = xdigit[c >> 4];
7666
encoded[out_cursor++] = xdigit[c & 15];
7767
}
7868
}
7969
encoded[out_cursor] = '\0';
8070

81-
const urlencode_result_t result = {encoded, ""};
82-
return result;
71+
return encoded;
8372
}
8473

8574
/*
 * Lua C function wrapping _encode_url().
 *
 * Stack in:  argument 1 is the string to encode (luaL_checkstring).
 * Stack out: one value, the encoded string.
 *
 * lua_pushstring() keeps its own copy of the bytes, so the heap buffer
 * produced by _encode_url() is released here instead of being leaked.
 * _encode_url() returns a string literal rather than a heap buffer when
 * the input is empty, so the buffer is freed only for non-empty input.
 */
static int encode_url (lua_State* L) {
    const uint8_t* input = (const uint8_t*)luaL_checkstring(L, 1);
    uint8_t* encoded = _encode_url(input);

    lua_pushstring(L, (const char*)encoded);

    if (input[0] != '\0') {
        /* Non-empty input: `encoded` came from malloc(). */
        free(encoded);
    }

    return 1;
}
10180

10281
#define __ 256
@@ -120,37 +99,23 @@ static const int hexval[256] = {
12099
};
121100
#undef __
122101

123-
static urlencode_result_t _decode_url(const unsigned char* input, const char* encoding) {
124-
if (setlocale(LC_CTYPE, encoding) == NULL) {
125-
const char* errmsg = "failed to configure the locale";
126-
printf("%s\n", errmsg);
127-
const urlencode_result_t result = {"", errmsg};
128-
return result;
129-
}
130-
102+
static uint8_t* _decode_url(const uint8_t* input) {
131103
const long len = strlen(input);
132-
const char* endmarker = input + len;
133-
char* decoded;
134-
decoded = (char*)malloc(sizeof(char) * len + 1);
104+
const uint8_t* endmarker = input + len;
105+
uint8_t* decoded;
106+
decoded = (uint8_t*)malloc(sizeof(uint8_t) * len + 1);
135107

136108
int in_cursor = 0;
137109
int out_cursor = 0;
138-
int char_count = 0;
139110
while (input[in_cursor] != '\0') {
140-
int charlen = mblen(&input[in_cursor], MB_CUR_MAX);
141-
if (charlen < 0) {
142-
const char* errmsg = "invalid character has come";
143-
printf("%s\n", errmsg);
144-
const urlencode_result_t result = {"", errmsg};
145-
return result;
146-
}
111+
const uint8_t charlen = utf8_len(&input[in_cursor]);
147112

148113
if (charlen == 0) {
149114
continue;
150115
}
151116

152117
if (charlen <= 1) {
153-
const unsigned char c = input[in_cursor++];
118+
const uint8_t c = input[in_cursor++];
154119

155120
if (c == '+') {
156121
decoded[out_cursor++] = ' ';
@@ -162,8 +127,15 @@ static urlencode_result_t _decode_url(const unsigned char* input, const char* en
162127
continue;
163128
}
164129

165-
const unsigned int v1 = hexval[(unsigned int)input[in_cursor++]];
166-
const unsigned int v2 = hexval[(unsigned int)input[in_cursor++]];
130+
const unsigned int v1raw = input[in_cursor++];
131+
const unsigned int v2raw = input[in_cursor++];
132+
if (v1raw == 0x30 && v2raw == 0x30) {
133+
// null char termination (%00)
134+
return decoded;
135+
}
136+
137+
const unsigned int v1 = hexval[v1raw];
138+
const unsigned int v2 = hexval[v2raw];
167139
if ((v1 | v2) != 0xFF) {
168140
decoded[out_cursor++] = (v1 << 4) | v2;
169141
continue;
@@ -175,31 +147,103 @@ static urlencode_result_t _decode_url(const unsigned char* input, const char* en
175147
}
176148

177149
for (int i = 0; i < charlen; i++, in_cursor++) {
178-
const unsigned char c = input[in_cursor];
150+
const uint8_t c = input[in_cursor];
179151
decoded[out_cursor++] = c;
180152
}
181153
}
182154
decoded[out_cursor] = '\0';
183155

184-
const urlencode_result_t result = {decoded, ""};
185-
return result;
156+
return decoded;
186157
}
187158

188159
/*
 * Lua C function wrapping _decode_url().
 *
 * Stack in:  argument 1 is the string to decode (luaL_checkstring).
 * Stack out: one value, the decoded string.
 *
 * _decode_url() always hands back a buffer obtained from malloc();
 * lua_pushstring() keeps its own copy of the bytes, so the buffer is
 * released here instead of being leaked on every call.
 */
static int decode_url (lua_State* L) {
    const uint8_t* input = (const uint8_t*)luaL_checkstring(L, 1);
    uint8_t* decoded = _decode_url(input);

    lua_pushstring(L, (const char*)decoded);
    free(decoded);

    return 1;
}
167+
168+
/*
 * UTF-8 sequence length detection.
 *
 * Two 256-entry tables indexed by the first byte of a sequence, plus
 * utf8_len(), which combines them with a check of the following bytes.
 */

/* Table marker: the length cannot be decided from the first byte alone. */
enum { UTF8_LEN_DEFERRED = 0xFF };

/*
 * Length that is known from the first byte alone:
 *   0x00       -> 0 (string terminator)
 *   0x01-0xC1  -> 1 (ASCII, continuation bytes, and the lead bytes 0xC0/0xC1)
 *   0xC2-0xF4  -> UTF8_LEN_DEFERRED (multi-byte lead, see utf8_count_len)
 *   0xF5-0xFF  -> 1 (not a usable lead byte)
 */
#define XX UTF8_LEN_DEFERRED
static const uint8_t utf8_immediate_len[256] = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00-0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20-0x2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30-0x3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40-0x4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50-0x5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60-0x6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70-0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80-0x8F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90-0x9F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0-0xAF */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0-0xBF */
    1, 1,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX, /* 0xC0-0xCF */
    XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX, /* 0xD0-0xDF */
    XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX,XX, /* 0xE0-0xEF */
    XX,XX,XX,XX,XX, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xF0-0xFF */
};
#undef XX

/*
 * Number of bytes a multi-byte lead byte announces:
 *   0xC2-0xDF -> 2
 *   0xE0-0xEF -> 3
 *   0xF0-0xF4 -> 4
 * Every other entry is 0.
 */
static const uint8_t utf8_count_len[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x00-0x0F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x10-0x1F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x20-0x2F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x30-0x3F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x40-0x4F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x50-0x5F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x60-0x6F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x70-0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
};

/* Non-zero when `byte` has the continuation-byte pattern 10xxxxxx. */
static int utf8_is_trail(uint8_t byte) {
    return (byte & 0xC0) == 0x80;
}

/*
 * Report how many bytes at `str` belong to the next character.
 *
 * str must point into a NUL-terminated byte string. Bytes after the first
 * are only read while every preceding byte was non-NUL, so the function
 * never reads past the terminator.
 *
 * Returns:
 *   0      when str points at the NUL terminator;
 *   1      for ASCII, for a stray continuation byte, for an unusable lead
 *          byte (0xC0, 0xC1, 0xF5..0xFF), and for a 3- or 4-byte lead whose
 *          second byte is rejected by the range checks below;
 *   2..4   for a lead byte followed by all of its continuation bytes;
 *   fewer  than announced when the sequence is cut short: the count of
 *          bytes seen before the first non-continuation byte.
 */
uint8_t utf8_len(const uint8_t* str) {
    const uint8_t lead = str[0];

    const uint8_t immediate = utf8_immediate_len[lead];
    if (immediate != UTF8_LEN_DEFERRED) {
        return immediate;
    }

    const uint8_t expected = utf8_count_len[lead];
    const uint8_t second = str[1];

    /* Second-byte range limits that depend on the specific lead byte. */
    if (expected == 3) {
        if ((lead == 0xE0 && second < 0xA0) || (lead == 0xED && second > 0x9F)) {
            return 1;
        }
    } else if (expected == 4) {
        if ((lead == 0xF0 && second < 0x90) || (lead == 0xF4 && second > 0x8F)) {
            return 1;
        }
    }

    /* Count the lead byte plus each consecutive continuation byte. */
    uint8_t size = 1;
    while (size < expected && utf8_is_trail(str[size])) {
        ++size;
    }
    return size;
}
204248

205249
static const struct luaL_Reg R[] = {
@@ -241,3 +285,4 @@ LUALIB_API int luaopen_urlencode(lua_State * L) {
241285
#ifdef __cplusplus
242286
}
243287
#endif
288+

src/lua-urlencode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ extern "C" {
1616
#include <stdlib.h>
1717
#include <string.h>
1818
#include <locale.h>
19+
#include <stdint.h>
1920

2021
#define URL_ENCODE_VERSION "0.0.1-0"
2122
#define URL_ENCODE_LIBNAME "urlencode"

0 commit comments

Comments
 (0)