#include <XMLTokenizer.h>
Public Types | |
enum | TokenType { END, ERROR, OPEN, ATTR, CLOSE, ECLOSE, ETAG, TEXT } |
The kind of token last read. More... | |
Public Methods | |
XMLTokenizer (std::istream &, int startline=1) | |
Create an XML tokenizer from the given input stream. | |
TokenType | next_token () |
Return the next token type in the input stream. | |
void | save_token () |
Push back the current token. | |
bool | inside_tag () |
int | get_line_number () const |
Return the current line number. | |
Public Attributes | |
TokenType | curr_token |
std::string | curr_name |
std::string | curr_text |
Private Methods | |
void | skip_whitespace () |
Read all space characters from the stream. | |
void | read_name () |
Read an identifier from the stream. | |
void | read_attr_val () |
Read an XML attribute value occurring after an '=' in a tag. | |
std::string | lookup_entity (const std::string &name) |
Convert a named XML entity to a string. | |
Private Attributes | |
std::istream & | input |
int | lineno |
bool | saved |
It does not tokenize full XML. XML comments and whitespace inside and around tags are omitted. Plain text is accepted elsewhere. For an explanation of the XML subset accepted, please see our XML primer.
|
The kind of token last read.
|
|
Create an XML tokenizer from the given input stream. The starting line initializes the line number.
00016 : curr_token(ERROR), input(i), lineno(startline), saved(false) 00017 {} |
|
Return the current line number.
00130 { 00131 return lineno; 00132 } |
|
00049 { return (curr_token == OPEN || curr_token == ATTR); } |
|
Convert a named XML entity to a string. This handles entities such as &quot; in text.
00202 { 00203 if (name == "amp") return "&"; 00204 else if (name == "lt") return "<"; 00205 else if (name == "gt") return ">"; 00206 else if (name == "quot") return "\""; 00207 else if (name == "apos") return "'"; 00208 else if (name == "sp") return " "; 00209 else 00210 throw Error("Unknown entity &" + name + ";"); 00211 } |
|
Return the next token type in the input stream. This sets curr_token, curr_name, and curr_text as appropriate.
00020 { 00021 // This routine does almost all the work of the tokenizer. 00022 if (saved) { 00023 saved = false; 00024 return curr_token; 00025 } else { 00026 // if there is a problem in tokenizing, 00027 // we throw an exception. 00028 // This makes the code simpler to follow. 00029 try { 00030 // Skip whitespace at the beginning always. 00031 skip_whitespace(); 00032 int ch = input.get(); 00033 if (ch == EOF) { 00034 return curr_token = END; 00035 } 00036 // If a tag has started, we 00037 // want to look for attribute bindings or for 00038 // an end tag: 00039 if (inside_tag()) { 00040 if (ch == '>') { 00041 return curr_token = CLOSE; 00042 } else if (ch == '/') { 00043 if (input.get() == '>') { 00044 return curr_token = ECLOSE; 00045 } else { 00046 throw Error("'/' loose inside tag"); 00047 } 00048 } else { 00049 input.putback(ch); 00050 read_name(); 00051 if (input.get() != '=') { 00052 throw Error("Missing '=' in attribute binding"); 00053 } 00054 read_attr_val(); 00055 return curr_token = ATTR; 00056 } 00057 } else { 00058 // Otherwise we look for nested elements, ... 00059 if (ch == '<') { 00060 ch = input.get(); 00061 if (ch == '/') { 00062 read_name(); 00063 if (input.get() != '>') { 00064 throw Error("Extra information in end tag"); 00065 } 00066 return curr_token = ETAG; 00067 } else if (ch == '!') { 00068 ch = input.get(); 00069 if (ch == '-') { 00070 if (input.get() != '-') { 00071 throw Error("Malformed comment (need two hypens)"); 00072 } 00073 while (input.ignore(INT_MAX,'-')) { 00074 if (input.get() == '-') break; 00075 } 00076 if (!input) { 00077 throw Error("Malformed comment (unterminated)"); 00078 } 00079 if (input.get() != '>') { 00080 throw Error("Malformed comment (missing final '>')"); 00081 } 00082 // try another read 00083 return next_token(); 00084 } else { 00085 input.putback(ch); 00086 read_name(); 00087 throw Error("Cannot handle <!" 
+ curr_name + "> tags"); 00088 } 00089 } else if (ch == '?') { 00090 read_name(); 00091 throw Error("Cannot handle <?" + curr_name + "> tags"); 00092 } else { 00093 input.putback(ch); 00094 read_name(); 00095 return curr_token = OPEN; 00096 } 00097 } else { 00098 // ... or nested text. 00099 curr_text = ""; 00100 for (; input; ch=input.get()) { 00101 if (ch == '<') { 00102 input.putback(ch); 00103 break; 00104 } 00105 if (ch == '&') { 00106 read_name(); 00107 if (input.get() != ';') { 00108 throw Error("&name; construct missing semicolon"); 00109 } 00110 curr_text += lookup_entity(curr_name); 00111 continue; 00112 } 00113 if (ch == '\n') ++lineno; 00114 curr_text += ch; 00115 } 00116 return curr_token = TEXT; 00117 } 00118 } 00119 } catch (Error &e) { 00120 curr_text = e.reason; 00121 return curr_token = ERROR; 00122 } 00123 } 00124 } |
|
Read an XML attribute value occurring after an '=' in a tag. Currently it only accepts strings delimited by single or double quotes. If no such value is found, an Error is thrown.
00181 { 00182 skip_whitespace(); 00183 char start; 00184 if (input.get(start) && (start == '"' || start == '\'')) { 00185 curr_text = ""; 00186 char ch; 00187 while (input.get(ch) && ch != start) { 00188 curr_text += ch; 00189 } 00190 if (ch != start) { 00191 throw Error("Unterminated attribute value"); 00192 } 00193 } else { 00194 throw Error("Attribute value must start with a quote"); 00195 } 00196 } |
|
Read an identifier from the stream. We first consume any whitespace and then look for a name according to the rules of XML: it starts with an alphabetic character or _ or : and continues with alphanumeric characters, periods, hyphens, underscores or colons. If no identifier is found, it throws an Error.
00155 { 00156 skip_whitespace(); 00157 char ch; 00158 if (input.get(ch) && 00159 isalpha(ch) || ch == '_' || ch == ':') { 00160 curr_name = ch; 00161 while (input.get(ch)) { 00162 if (isalnum(ch) || ch == '.' || ch == '-' || 00163 ch == '_' || ch == ':') { 00164 curr_name += ch; 00165 } else { 00166 input.putback(ch); 00167 break; 00168 } 00169 } 00170 } else { 00171 throw Error("Expected Name"); 00172 } 00173 skip_whitespace(); 00174 } |
|
Push back the current token. The next token returned will be this one. This does *not* change the curr_token, curr_name or curr_text fields.
00126 { 00127 saved = true; 00128 } |
|
Read all space characters from the stream.
|
|
|
|
|
|
|
|
|
|
|
|
|