/* xml.c - xml2tsv - a simple xml-to-tsv converter, based on xmlparser */
1 #include <ctype.h> |
|
|
|
2 #include <errno.h> |
|
|
|
3 #include <stdio.h> |
|
|
|
4 #include <stdlib.h> |
|
|
|
5 #include <string.h> |
|
|
|
6 |
|
|
|
7 #include "xml.h" |
|
|
|
8 |
|
|
|
9 static void |
|
|
|
10 xml_parseattrs(XMLParser *x) |
|
|
|
11 { |
|
|
|
12 size_t namelen = 0, valuelen; |
|
|
|
13 int c, endsep, endname = 0, valuestart = 0; |
|
|
|
14 |
|
|
|
15 while ((c = GETNEXT()) != EOF) { |
|
|
|
16 if (isspace(c)) { |
|
|
|
17 if (namelen) |
|
|
|
18 endname = 1; |
|
|
|
19 continue; |
|
|
|
20 } else if (c == '?') |
|
|
|
21 ; /* ignore */ |
|
|
|
22 else if (c == '=') { |
|
|
|
23 x->name[namelen] = '\0'; |
|
|
|
24 valuestart = 1; |
|
|
|
25 endname = 1; |
|
|
|
26 } else if (namelen && ((endname && !valuestart && isalpha(c)) || (c == '>' || c == '/'))) { |
|
|
|
27 /* attribute without value */ |
|
|
|
28 x->name[namelen] = '\0'; |
|
|
|
29 if (x->xmlattrstart) |
|
|
|
30 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); |
|
|
|
31 if (x->xmlattr) |
|
|
|
32 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, "", 0); |
|
|
|
33 if (x->xmlattrend) |
|
|
|
34 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); |
|
|
|
35 endname = 0; |
|
|
|
36 x->name[0] = c; |
|
|
|
37 namelen = 1; |
|
|
|
38 } else if (namelen && valuestart) { |
|
|
|
39 /* attribute with value */ |
|
|
|
40 if (x->xmlattrstart) |
|
|
|
41 x->xmlattrstart(x, x->tag, x->taglen, x->name, namelen); |
|
|
|
42 |
|
|
|
43 valuelen = 0; |
|
|
|
44 if (c == '\'' || c == '"') { |
|
|
|
45 endsep = c; |
|
|
|
46 } else { |
|
|
|
47 endsep = ' '; /* isspace() */ |
|
|
|
48 goto startvalue; |
|
|
|
49 } |
|
|
|
50 |
|
|
|
51 while ((c = GETNEXT()) != EOF) { |
|
|
|
52 startvalue: |
|
|
|
53 if (c == '&') { /* entities */ |
|
|
|
54 x->data[valuelen] = '\0'; |
|
|
|
55 /* call data function with data before entity if there is data */ |
|
|
|
56 if (valuelen && x->xmlattr) |
|
|
|
57 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); |
|
|
|
58 x->data[0] = c; |
|
|
|
59 valuelen = 1; |
|
|
|
60 while ((c = GETNEXT()) != EOF) { |
|
|
|
61 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) |
|
|
|
62 break; |
|
|
|
63 if (valuelen < sizeof(x->data) - 1) |
|
|
|
64 x->data[valuelen++] = c; |
|
|
|
65 else { |
|
|
|
66 /* entity too long for buffer, handle as normal data */ |
|
|
|
67 x->data[valuelen] = '\0'; |
|
|
|
68 if (x->xmlattr) |
|
|
|
69 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); |
|
|
|
70 x->data[0] = c; |
|
|
|
71 valuelen = 1; |
|
|
|
72 break; |
|
|
|
73 } |
|
|
|
74 if (c == ';') { |
|
|
|
75 x->data[valuelen] = '\0'; |
|
|
|
76 if (x->xmlattrentity) |
|
|
|
77 x->xmlattrentity(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); |
|
|
|
78 valuelen = 0; |
|
|
|
79 break; |
|
|
|
80 } |
|
|
|
81 } |
|
|
|
82 } else if (c != endsep && !(endsep == ' ' && (c == '>' || isspace(c)))) { |
|
|
|
83 if (valuelen < sizeof(x->data) - 1) { |
|
|
|
84 x->data[valuelen++] = c; |
|
|
|
85 } else { |
|
|
|
86 x->data[valuelen] = '\0'; |
|
|
|
87 if (x->xmlattr) |
|
|
|
88 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); |
|
|
|
89 x->data[0] = c; |
|
|
|
90 valuelen = 1; |
|
|
|
91 } |
|
|
|
92 } |
|
|
|
93 if (c == endsep || (endsep == ' ' && (c == '>' || isspace(c)))) { |
|
|
|
94 x->data[valuelen] = '\0'; |
|
|
|
95 if (x->xmlattr) |
|
|
|
96 x->xmlattr(x, x->tag, x->taglen, x->name, namelen, x->data, valuelen); |
|
|
|
97 if (x->xmlattrend) |
|
|
|
98 x->xmlattrend(x, x->tag, x->taglen, x->name, namelen); |
|
|
|
99 break; |
|
|
|
100 } |
|
|
|
101 } |
|
|
|
102 namelen = endname = valuestart = 0; |
|
|
|
103 } else if (namelen < sizeof(x->name) - 1) { |
|
|
|
104 x->name[namelen++] = c; |
|
|
|
105 } |
|
|
|
106 if (c == '>') { |
|
|
|
107 break; |
|
|
|
108 } else if (c == '/') { |
|
|
|
109 x->isshorttag = 1; |
|
|
|
110 x->name[0] = '\0'; |
|
|
|
111 namelen = 0; |
|
|
|
112 } |
|
|
|
113 } |
|
|
|
114 } |
|
|
|
115 |
|
|
|
116 static void |
|
|
|
117 xml_parsecomment(XMLParser *x) |
|
|
|
118 { |
|
|
|
119 size_t i = 0; |
|
|
|
120 int c; |
|
|
|
121 |
|
|
|
122 while ((c = GETNEXT()) != EOF) { |
|
|
|
123 if (c == '-') { |
|
|
|
124 if (++i > 2) |
|
|
|
125 i = 2; |
|
|
|
126 continue; |
|
|
|
127 } else if (c == '>' && i == 2) { |
|
|
|
128 return; |
|
|
|
129 } else if (i) { |
|
|
|
130 i = 0; |
|
|
|
131 } |
|
|
|
132 } |
|
|
|
133 } |
|
|
|
134 |
|
|
|
135 static void |
|
|
|
136 xml_parsecdata(XMLParser *x) |
|
|
|
137 { |
|
|
|
138 size_t datalen = 0, i = 0; |
|
|
|
139 int c; |
|
|
|
140 |
|
|
|
141 if (x->xmlcdatastart) |
|
|
|
142 x->xmlcdatastart(x); |
|
|
|
143 while ((c = GETNEXT()) != EOF) { |
|
|
|
144 if (c == ']' || c == '>') { |
|
|
|
145 if (x->xmlcdata && datalen) { |
|
|
|
146 x->data[datalen] = '\0'; |
|
|
|
147 x->xmlcdata(x, x->data, datalen); |
|
|
|
148 datalen = 0; |
|
|
|
149 } |
|
|
|
150 } |
|
|
|
151 |
|
|
|
152 if (c == ']') { |
|
|
|
153 if (++i > 2) { |
|
|
|
154 if (x->xmlcdata) |
|
|
|
155 for (; i > 2; i--) |
|
|
|
156 x->xmlcdata(x, "]", 1); |
|
|
|
157 i = 2; |
|
|
|
158 } |
|
|
|
159 continue; |
|
|
|
160 } else if (c == '>' && i == 2) { |
|
|
|
161 if (x->xmlcdataend) |
|
|
|
162 x->xmlcdataend(x); |
|
|
|
163 return; |
|
|
|
164 } else if (i) { |
|
|
|
165 if (x->xmlcdata) |
|
|
|
166 for (; i > 0; i--) |
|
|
|
167 x->xmlcdata(x, "]", 1); |
|
|
|
168 i = 0; |
|
|
|
169 } |
|
|
|
170 |
|
|
|
171 if (datalen < sizeof(x->data) - 1) { |
|
|
|
172 x->data[datalen++] = c; |
|
|
|
173 } else { |
|
|
|
174 x->data[datalen] = '\0'; |
|
|
|
175 if (x->xmlcdata) |
|
|
|
176 x->xmlcdata(x, x->data, datalen); |
|
|
|
177 x->data[0] = c; |
|
|
|
178 datalen = 1; |
|
|
|
179 } |
|
|
|
180 } |
|
|
|
181 } |
|
|
|
182 |
|
|
|
/*
 * Encode code point `r` as UTF-8 into `s` (no NUL terminator is written).
 *
 * Returns the number of bytes stored: 0 for r == 0 (nothing is written),
 * otherwise 1 to 4. `s` must have room for 4 bytes. The value is not
 * validated here; anything above 0xFFFF is written as a 4-byte sequence.
 */
static int
codepointtoutf8(long r, char *s)
{
	/* lead-byte marker and payload mask, indexed by sequence length */
	static const unsigned char lead[] = { 0, 0, 0xC0, 0xE0, 0xF0 };
	static const unsigned char mask[] = { 0, 0, 0x1F, 0x0F, 0x07 };
	int len, i;

	if (r == 0)
		return 0; /* NUL byte */

	if (r <= 0x7F) {
		/* 1 byte: 0aaaaaaa */
		s[0] = r;
		return 1;
	}

	if (r <= 0x07FF)
		len = 2; /* 110aaaaa 10bbbbbb */
	else if (r <= 0xFFFF)
		len = 3; /* 1110aaaa 10bbbbbb 10cccccc */
	else
		len = 4; /* 11110aaa 10bbbbbb 10cccccc 10dddddd */

	/* fill the continuation bytes from the back, 6 payload bits each */
	for (i = len - 1; i > 0; i--) {
		s[i] = 0x80 | (r & 0x3F);
		r >>= 6;
	}
	/* the remaining high bits go into the lead byte */
	s[0] = lead[len] | (r & mask[len]);

	return len;
}
|
|
|
212 |
|
|
|
/*
 * Translate a named entity to its single-character string.
 *
 * `e` is the entity text without the leading '&' but including the trailing
 * ';' (for example "amp;"). Only amp, lt, gt, apos and quot are known.
 * On a match the character and a NUL terminator are written to `buf` and 1
 * is returned. Returns -1 when `bufsiz` is smaller than 2 or the name is not
 * recognised.
 */
static int
namedentitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* names[k] decodes to chars[k] */
	static const char *const names[] = {
		"amp;", "lt;", "gt;", "apos;", "quot;"
	};
	static const char chars[] = {
		'&', '<', '>', '\'', '"'
	};
	const size_t count = sizeof(names) / sizeof(names[0]);
	size_t k;

	/* need room for one character plus the terminator */
	if (bufsiz < 2)
		return -1;

	for (k = 0; k < count; k++) {
		if (strcmp(e, names[k]) != 0)
			continue;
		buf[0] = chars[k];
		buf[1] = '\0';
		return 1;
	}

	return -1;
}
|
|
|
241 |
|
|
|
/*
 * Convert a numeric character reference to a NUL-terminated UTF-8 string.
 *
 * `e` is the reference text after the "&#" prefix and including the
 * trailing ';' (for example "233;" or "xE9;"). A leading 'x' selects
 * hexadecimal, otherwise the number is read as decimal.
 *
 * Returns the number of bytes written to `buf`, not counting the terminator
 * (0 for code point 0), or -1 when:
 *   - `bufsiz` is smaller than 5 (4 UTF-8 bytes plus the terminator),
 *   - the number cannot be parsed or is not directly followed by ';',
 *   - the value is negative, above 0x10FFFF, or a UTF-16 surrogate
 *     (0xD800..0xDFFF). Surrogates are not characters: they are not legal
 *     XML character references and cannot be represented in valid UTF-8.
 */
static int
numericentitytostr(const char *e, char *buf, size_t bufsiz)
{
	long l;
	int len;
	char *end;

	/* buffer is too small */
	if (bufsiz < 5)
		return -1;

	/* strtol reports range errors only through errno, so clear it first */
	errno = 0;
	/* hex (16) or decimal (10) */
	if (*e == 'x')
		l = strtol(++e, &end, 16);
	else
		l = strtol(e, &end, 10);
	/* invalid value or not a well-formed entity or invalid code point */
	if (errno || e == end || *end != ';' || l < 0 || l > 0x10ffff ||
	    (l >= 0xd800 && l <= 0xdfff))
		return -1;
	len = codepointtoutf8(l, buf);
	buf[len] = '\0';

	return len;
}
|
|
|
267 |
|
|
|
/*
 * Decode one entity reference, named ("&amp;") or numeric ("&#38;",
 * "&#x26;"), into `buf`.
 *
 * `e` must start with '&'. The text after "&#" is passed to
 * numericentitytostr(), anything else after '&' to namedentitytostr().
 * Returns the byte length of the decoded string, or -1 on failure
 * (including when `e` does not start with '&').
 */
int
xml_entitytostr(const char *e, char *buf, size_t bufsiz)
{
	/* not an entity reference at all */
	if (e[0] != '&')
		return -1;

	return (e[1] == '#')
	    ? numericentitytostr(e + 2, buf, bufsiz)
	    : namedentitytostr(e + 1, buf, bufsiz);
}
|
|
|
282 |
|
|
|
283 void |
|
|
|
284 xml_parse(XMLParser *x) |
|
|
|
285 { |
|
|
|
286 size_t datalen, tagdatalen; |
|
|
|
287 int c, isend; |
|
|
|
288 |
|
|
|
289 while ((c = GETNEXT()) != EOF && c != '<') |
|
|
|
290 ; /* skip until < */ |
|
|
|
291 |
|
|
|
292 while (c != EOF) { |
|
|
|
293 if (c == '<') { /* parse tag */ |
|
|
|
294 if ((c = GETNEXT()) == EOF) |
|
|
|
295 return; |
|
|
|
296 |
|
|
|
297 if (c == '!') { /* cdata and comments */ |
|
|
|
298 for (tagdatalen = 0; (c = GETNEXT()) != EOF;) { |
|
|
|
299 /* NOTE: sizeof(x->data) must be atleast sizeof("[CDATA[") */ |
|
|
|
300 if (tagdatalen <= sizeof("[CDATA[") - 1) |
|
|
|
301 x->data[tagdatalen++] = c; |
|
|
|
302 if (c == '>') |
|
|
|
303 break; |
|
|
|
304 else if (c == '-' && tagdatalen == sizeof("--") - 1 && |
|
|
|
305 (x->data[0] == '-')) { |
|
|
|
306 xml_parsecomment(x); |
|
|
|
307 break; |
|
|
|
308 } else if (c == '[') { |
|
|
|
309 if (tagdatalen == sizeof("[CDATA[") - 1 && |
|
|
|
310 !strncmp(x->data, "[CDATA[", tagdatalen)) { |
|
|
|
311 xml_parsecdata(x); |
|
|
|
312 break; |
|
|
|
313 } |
|
|
|
314 } |
|
|
|
315 } |
|
|
|
316 } else { |
|
|
|
317 /* normal tag (open, short open, close), processing instruction. */ |
|
|
|
318 x->tag[0] = c; |
|
|
|
319 x->taglen = 1; |
|
|
|
320 x->isshorttag = isend = 0; |
|
|
|
321 |
|
|
|
322 /* treat processing instruction as shorttag, don't strip "?" prefix. */ |
|
|
|
323 if (c == '?') { |
|
|
|
324 x->isshorttag = 1; |
|
|
|
325 } else if (c == '/') { |
|
|
|
326 if ((c = GETNEXT()) == EOF) |
|
|
|
327 return; |
|
|
|
328 x->tag[0] = c; |
|
|
|
329 isend = 1; |
|
|
|
330 } |
|
|
|
331 |
|
|
|
332 while ((c = GETNEXT()) != EOF) { |
|
|
|
333 if (c == '/') |
|
|
|
334 x->isshorttag = 1; /* short tag */ |
|
|
|
335 else if (c == '>' || isspace(c)) { |
|
|
|
336 x->tag[x->taglen] = '\0'; |
|
|
|
337 if (isend) { /* end tag, starts with </ */ |
|
|
|
338 if (x->xmltagend) |
|
|
|
339 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); |
|
|
|
340 x->tag[0] = '\0'; |
|
|
|
341 x->taglen = 0; |
|
|
|
342 } else { |
|
|
|
343 /* start tag */ |
|
|
|
344 if (x->xmltagstart) |
|
|
|
345 x->xmltagstart(x, x->tag, x->taglen); |
|
|
|
346 if (isspace(c)) |
|
|
|
347 xml_parseattrs(x); |
|
|
|
348 if (x->xmltagstartparsed) |
|
|
|
349 x->xmltagstartparsed(x, x->tag, x->taglen, x->isshorttag); |
|
|
|
350 } |
|
|
|
351 /* call tagend for shortform or processing instruction */ |
|
|
|
352 if (x->isshorttag) { |
|
|
|
353 if (x->xmltagend) |
|
|
|
354 x->xmltagend(x, x->tag, x->taglen, x->isshorttag); |
|
|
|
355 x->tag[0] = '\0'; |
|
|
|
356 x->taglen = 0; |
|
|
|
357 } |
|
|
|
358 break; |
|
|
|
359 } else if (x->taglen < sizeof(x->tag) - 1) |
|
|
|
360 x->tag[x->taglen++] = c; /* NOTE: tag name truncation */ |
|
|
|
361 } |
|
|
|
362 } |
|
|
|
363 } else { |
|
|
|
364 /* parse tag data */ |
|
|
|
365 datalen = 0; |
|
|
|
366 if (x->xmldatastart) |
|
|
|
367 x->xmldatastart(x); |
|
|
|
368 while ((c = GETNEXT()) != EOF) { |
|
|
|
369 if (c == '&') { |
|
|
|
370 if (datalen) { |
|
|
|
371 x->data[datalen] = '\0'; |
|
|
|
372 if (x->xmldata) |
|
|
|
373 x->xmldata(x, x->data, datalen); |
|
|
|
374 } |
|
|
|
375 x->data[0] = c; |
|
|
|
376 datalen = 1; |
|
|
|
377 while ((c = GETNEXT()) != EOF) { |
|
|
|
378 if (c == '<') |
|
|
|
379 break; |
|
|
|
380 if (datalen < sizeof(x->data) - 1) |
|
|
|
381 x->data[datalen++] = c; |
|
|
|
382 else { |
|
|
|
383 /* entity too long for buffer, handle as normal data */ |
|
|
|
384 x->data[datalen] = '\0'; |
|
|
|
385 if (x->xmldata) |
|
|
|
386 x->xmldata(x, x->data, datalen); |
|
|
|
387 x->data[0] = c; |
|
|
|
388 datalen = 1; |
|
|
|
389 break; |
|
|
|
390 } |
|
|
|
391 if (c == ';') { |
|
|
|
392 x->data[datalen] = '\0'; |
|
|
|
393 if (x->xmldataentity) |
|
|
|
394 x->xmldataentity(x, x->data, datalen); |
|
|
|
395 datalen = 0; |
|
|
|
396 break; |
|
|
|
397 } |
|
|
|
398 } |
|
|
|
399 } else if (c != '<') { |
|
|
|
400 if (datalen < sizeof(x->data) - 1) { |
|
|
|
401 x->data[datalen++] = c; |
|
|
|
402 } else { |
|
|
|
403 x->data[datalen] = '\0'; |
|
|
|
404 if (x->xmldata) |
|
|
|
405 x->xmldata(x, x->data, datalen); |
|
|
|
406 x->data[0] = c; |
|
|
|
407 datalen = 1; |
|
|
|
408 } |
|
|
|
409 } |
|
|
|
410 if (c == '<') { |
|
|
|
411 x->data[datalen] = '\0'; |
|
|
|
412 if (x->xmldata && datalen) |
|
|
|
413 x->xmldata(x, x->data, datalen); |
|
|
|
414 if (x->xmldataend) |
|
|
|
415 x->xmldataend(x); |
|
|
|
416 break; |
|
|
|
417 } |
|
|
|
418 } |
|
|
|
419 } |
|
|
|
420 } |
|
|
|
421 } |
|