]> andersk Git - libyaml.git/blame - src/reader.c
Working on the decoding code.
[libyaml.git] / src / reader.c
CommitLineData
95b98ba9 1
6eb1ded4
KS
2#if HAVE_CONFIG_H
3#include <config.h>
4#endif
5
6#include <yaml/yaml.h>
7
8#include <assert.h>
9
/*
 * Test whether the two bytes at `pointer` are the UTF-16-BE byte order
 * mark (FE FF).  `pointer` is expanded twice; pass a side-effect-free
 * expression.
 */
#define IS_UTF16BE_BOM(pointer)                                             \
    (0xFE == (pointer)[0] && 0xFF == (pointer)[1])

/*
 * Test whether the two bytes at `pointer` are the UTF-16-LE byte order
 * mark (FF FE).  `pointer` is expanded twice; pass a side-effect-free
 * expression.
 */
#define IS_UTF16LE_BOM(pointer)                                             \
    (0xFF == (pointer)[0] && 0xFE == (pointer)[1])
15
/*
 * Get a UTF-16-BE code unit: the byte at pointer[0] is the high byte.
 *
 * The shift is parenthesized explicitly because `+` binds tighter than
 * `<<`; without the parentheses the second byte would be added to the
 * shift count.  `pointer` is expanded twice, so it must be free of side
 * effects.
 */
#define UTF16BE_CHAR(pointer) ((((pointer)[0]) << 8) + (pointer)[1])

/*
 * Get a UTF-16-LE code unit: the byte at pointer[1] is the high byte.
 *
 * The shift is parenthesized explicitly for the same precedence reason.
 * `pointer` is expanded twice, so it must be free of side effects.
 */
#define UTF16LE_CHAR(pointer) ((pointer)[0] + (((pointer)[1]) << 8))
21
22/*
23 * From http://www.ietf.org/rfc/rfc3629.txt:
24 *
25 * Char. number range | UTF-8 octet sequence
26 * (hexadecimal) | (binary)
27 * --------------------+---------------------------------------------
28 * 0000 0000-0000 007F | 0xxxxxxx
29 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
30 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
31 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
32 */
33
/*
 * Length in bytes of the UTF-8 sequence introduced by the byte at
 * `pointer`, or 0 when that byte cannot start a sequence (a continuation
 * byte 80..BF or a value of F8 and above).
 *
 * `pointer` is expanded once per comparison; pass a side-effect-free
 * expression.
 */
#define UTF8_LENGTH(pointer)                                                \
    (*(pointer) <= 0x7F ? 1 :                                               \
     *(pointer) <= 0xBF ? 0 :                                               \
     *(pointer) <= 0xDF ? 2 :                                               \
     *(pointer) <= 0xEF ? 3 :                                               \
     *(pointer) <= 0xF7 ? 4 : 0)
41
/*
 * Payload bits carried by the leading byte of a UTF-8 sequence, or 0xFF
 * when the byte at `pointer` cannot start a sequence.
 *
 * `pointer` is expanded once per comparison and once more for the result;
 * pass a side-effect-free expression (an argument such as `p++` would be
 * incremented several times).
 */
#define UTF8_FIRST_CHUNK(pointer)                                           \
    (*(pointer) <= 0x7F ? *(pointer) & 0x7F :                               \
     *(pointer) <= 0xBF ? 0xFF :                                            \
     *(pointer) <= 0xDF ? *(pointer) & 0x1F :                               \
     *(pointer) <= 0xEF ? *(pointer) & 0x0F :                               \
     *(pointer) <= 0xF7 ? *(pointer) & 0x07 : 0xFF)
49
/*
 * Six payload bits of a UTF-8 continuation byte (80..BF), or 0xFF when the
 * byte at `pointer` is not a continuation byte.
 *
 * `pointer` is expanded up to three times; pass a side-effect-free
 * expression.
 */
#define UTF8_NEXT_CHUNK(pointer)                                            \
    (*(pointer) > 0x7F && *(pointer) <= 0xBF ? *(pointer) & 0x3F : 0xFF)
53
54/* Determine the length of a UTF-8 character. */
95b98ba9
KS
55
56/*
57 * Ensure that the buffer contains at least length characters.
58 * Return 1 on success, 0 on failure.
6eb1ded4
KS
59 *
60 * The length is supposed to be significantly less that the buffer size.
95b98ba9
KS
61 */
62
63int
6eb1ded4 64yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
95b98ba9
KS
65{
66 /* If the EOF flag is set, do nothing. */
67
68 if (parser->eof)
69 return 1;
70
6eb1ded4
KS
71 /* Return if the buffer contains enough characters. */
72
73 if (parser->unread >= length)
74 return 1;
75
76 /* Determine the input encoding if it is not known yet. */
95b98ba9 77
6eb1ded4
KS
78 if (!parser->encoding) {
79 if (!yaml_parser_determine_encoding(parser))
95b98ba9 80 return 0;
95b98ba9
KS
81 }
82
6eb1ded4
KS
83 /* Move the unread characters to the beginning of the buffer. */
84
85 if (parser->buffer < parser->pointer
86 && parser->pointer < parser->buffer_end) {
87 size_t size = parser->buffer_end - parser->pointer;
88 memmove(parser->buffer, parser->pointer, size);
89 parser->pointer = parser->buffer;
90 parser->buffer_end -= size;
91 }
92 else if (parser->pointer == parser->buffer_end) {
93 parser->pointer = parser->buffer;
94 parser->buffer_end = parser->buffer;
95 }
96
97 /* Fill the buffer until it has enough characters. */
98
99 while (parser->unread < length)
100 {
101 /* Fill the raw buffer. */
102
103 if (!yaml_parser_update_raw_buffer(parser)) return 0;
104
105 /* Decode the raw buffer. */
106
107 while (parser->raw_unread)
108 {
109 unsigned int ch;
110 int incomplete = 0;
111
112 /* Decode the next character. */
113
114 switch (parser->encoding)
115 {
116 case YAML_UTF8_ENCODING:
117
118 unsigned int utf8_length = UTF8_LENGTH(parser->raw_pointer);
119 unsigned int utf8_chunk;
120
121 /* Check if the raw buffer contains an incomplete character. */
122
123 if (utf8_length > parser->raw_unread) {
124 if (parser->eof) {
125 parser->error = YAML_READER_ERROR;
126 return 0;
127 }
128 incomplete = 1;
129 }
130
131 /* Get the character checking it for validity. */
132
133 utf8_chunk = UTF8_FIRST_CHUNK(parser->raw_pointer ++);
134 if (utf8_chunk == 0xFF) {
135 parser->error = YAML_READER_ERROR;
136 return 0;
137 }
138 ch = utf8_chunk;
139 parser->raw_unread --;
140 while (-- utf8_length) {
141 utf8_chunk = UTF8_NEXT_CHUNK(parser->raw_pointer ++);
142 if (utf8_chunk == 0xFF) {
143 parser->error = YAML_READER_ERROR;
144 return 0;
145 }
146 ch = ch << 6 + utf8_chunk;
147 parser->raw_unread --;
148 }
149
150 break;
151
152 case YAML_UTF16LE_ENCODING:
153
154 /* Check if the raw buffer contains an incomplete character. */
155
156 if (parser->raw_unread < 2) {
157 if (parser->eof) {
158 parser->error = YAML_READER_ERROR;
159 return 0;
160 }
161 incomplete = 1;
162 }
163
164 /* Get the current character. */
165
166 ch = UTF16LE_CHAR(parser->raw_pointer);
167 parser->raw_pointer += 2;
168 parser->raw_unread -= 2;
169
170 break;
171
172 case YAML_UTF16BE_ENCODING:
173
174 /* Check if the raw buffer contains an incomplete character. */
175
176 if (parser->raw_unread < 2) {
177 if (parser->eof) {
178 parser->error = YAML_READER_ERROR;
179 return 0;
180 }
181 incomplete = 1;
182 }
183
184 /* Get the current character. */
185
186 ch = UTF16BE_CHAR(parser->raw_pointer);
187 parser->raw_pointer += 2;
188 parser->raw_unread -= 2;
189
190 break;
191 }
192
193 /*
194 * Check if the character is in the allowed range:
195 * #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
196 * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
197 * | [#x10000-#x10FFFF] (32 bit)
198 */
199
200 if (! (ch == 0x09 || ch == 0x0A || ch == 0x0D
201 || (ch >= 0x20 && ch <= 0x7E)
202 || (ch == 0x85) || (ch >= 0xA0 && ch <= 0xD7FF)
203 || (ch >= 0xE000 && ch <= 0xFFFD)
204 || (ch >= 0x10000 && ch <= 0x10FFFF))) {
205 parser->error = YAML_READER_ERROR;
206 return 0;
207 }
208
209 /* Finally put the character into the buffer. */
210
211 /* 0000 0000-0000 007F -> 0xxxxxxx */
212 if (ch <= 0x7F) {
213 *(parser->buffer_end++) = ch;
214 }
215 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
216 else if (ch <= 0x7FF) {
217 *(parser->buffer_end++) = 0xC0 + (ch >> 6) & 0x1F;
218 *(parser->buffer_end++) = 0x80 + ch & 0x3F;
219 }
220 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
221 else if (ch <= 0xFFFF) {
222 *(parser->buffer_end++) = 0x80 + ch & 0x3F;
223 *(parser->buffer_end++) = 0xC0 + (ch >> 6) & 0x1F;
224
225 }
226 /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
227 else {
228 }
95b98ba9 229 }
6eb1ded4 230
95b98ba9
KS
231 }
232
6eb1ded4 233}
95b98ba9 234
6eb1ded4
KS
235/*
236 * Determine the input stream encoding by checking the BOM symbol. If no BOM is
237 * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
238 */
239
240int
241yaml_parser_determine_encoding(yaml_parser_t *parser)
242{
243 /* Ensure that we had enough bytes in the raw buffer. */
244
245 while (!parser->eof && parser->raw_unread < 2) {
246 if (!yaml_parser_update_raw_buffer(parser)) {
95b98ba9 247 return 0;
6eb1ded4 248 }
95b98ba9
KS
249 }
250
6eb1ded4 251 /* Determine the encoding. */
95b98ba9 252
6eb1ded4
KS
253 if (parser->raw_unread >= 2 && IS_UTF16BE_BOM(parser->raw_pointer)) {
254 parser->encoding = YAML_UTF16BE_ENCODING;
255 }
256 else if (parser->raw_unread >= 2 && IS_UTF16LE_BOM(parser->raw_pointer)) {
257 parser->encoding = YAML_UTF16LE_ENCODING;
258 }
259 else {
260 parser->encoding = YAML_UTF8_ENCODING;
261 }
95b98ba9
KS
262}
263
6eb1ded4
KS
264/*
265 * Update the raw buffer.
266 */
267
268int
269yaml_parser_update_raw_buffer(yaml_parser_t *parser)
270{
271 size_t size_read = 0;
272
273 /* Return if the raw buffer is full. */
274
275 if (parser->raw_unread == YAML_RAW_BUFFER_SIZE) return 1;
95b98ba9 276
6eb1ded4
KS
277 /* Return on EOF. */
278
279 if (parser->eof) return 1;
280
281 /* Move the remaining bytes in the raw buffer to the beginning. */
282
283 if (parser->raw_unread && parser->raw_buffer < parser->raw_pointer) {
284 memmove(parser->raw_buffer, parser->raw_pointer, parser->raw_unread);
285 }
286 parser->raw_pointer = parser->raw_buffer;
287
288 /* Call the read handler to fill the buffer. */
289
290 if (!parser->read_handler(parser->read_handler_data,
291 parser->raw_buffer + parser->raw_unread,
292 YAML_RAW_BUFFER_SIZE - parser->raw_unread,
293 &size_read)) {
294 parser->error = YAML_READER_ERROR;
295 return 0;
296 }
297 parser->raw_unread += size_read;
298 if (!size_read) {
299 parser->eof = 1;
300 }
301
302 return 1;
303}
95b98ba9 304
This page took 0.08681 seconds and 5 git commands to generate.