// ПРИМЕР ИСПОЛЬЗОВАНИЯ:
uzing namespace tool;
const char *s = "var a=1+1; b='string'; etc. "; // это мы будем парсить
tokenz tk(s,strlen(s));
tk.break_chars("+-/*%,;=\n"); // на этом будем 'тормозить'
int token;
// ну типа парсим как настояшши пацаны...
while(token = tk.token())
{
switch(token)
{
case tokenz::WORD_VALUE:
printf("token = %s\n",
(const char*
)tk.
token_value());
break;
case tokenz::STRING_VALUE:
printf("string literal = %c%s%c\n",
tk.quote_used(),
(const char*)tk.token_value(),
tk.quote_used());
break;
case tokenz::BREAK_CHAR:
printf("break = %c\n",tk.
break_used());
break;
}
}
// собственно конец. (извиняюсь)
// File tl_tokenizer.h >>
//|
//| tl_tokenizer.h
//|
//| Copyright (c) 2001, 2002
//| Andrew Fedoniouk - andrew@terra-informatica.org
//| Portions: Serge Kuznetsov (a.k.a. "ComputerMage") - kuznetsov@deeptown.org
//|
#ifndef __tl_tokenizer_h
#define __tl_tokenizer_h
//|
//|
//| (semi)universal tokenizer.
//|
//|
#include "tl_string.h"
namespace tool
{
// Tokenizer over a caller-supplied character range.
// Each call to token() extracts the next token and reports its kind;
// token_value(), break_used() and quote_used() then describe that token.
// Only pointers into the input are stored, so the input buffer is not copied.
class tokenz
{
public:
// Case conversion that token_value() applies to the token text.
enum cvt_flag
{
cvt_no = 0,
cvt_to_upper = 1,
cvt_to_lower = 2
};
// Kinds of token reported by token(). END_OF_TEXT is 0, so the result of
// token() can be used directly as a loop condition.
enum token_types {
END_OF_TEXT = 0,
BREAK_CHAR,
WORD_VALUE,
STRING_VALUE
};
protected:
int _p_state; // current scanner state (a parser_states value from the .cpp file)
cvt_flag _p_flag; // case conversion requested at construction
char _p_curquote; // quote character that opened the literal being scanned, 0 if none
string _token; // text of the last token
const char* _text; // start of the input text
const char* _text_end; // one past the last input character
const char* _pos; // current position in the input
string _whites; // characters treated as white space
string _breaks; // characters that are reported as BREAK_CHAR tokens
string _quotes; // characters that may open/close a quoted literal
char _eschar; // escape character: the character following it is taken literally
char _break_used; // character recorded by the last token() call as the break, 0 if none
char _quote_used; // quote character seen by the last token() call, 0 if none
public:
// Sets up scanning of text_length characters starting at text.
tokenz ( const char * text, size_t text_length, cvt_flag flag = cvt_no );
// Replace the white-space, break and quote character sets respectively.
void white_chars ( const char * ps ) { _whites = ps; }
void break_chars ( const char * ps ) { _breaks = ps; }
void quote_chars ( const char * ps ) { _quotes = ps; }
// Extracts the next token and returns its kind (a token_types value).
int token();
// Returns the text of the last token, case-converted according to the flag.
string token_value();
// Accessors for the break / quote character recorded by the last token() call.
char break_used() const { return _break_used; }
char quote_used() const { return _quote_used; }
protected:
// Returns the index of ch within the NUL-terminated str, or -1 if absent.
int sindex ( char ch, const char *str );
};
}
#endif //__tl_tokenizer_h
// File tl_tokenizer.cpp >>
#include "tl_tokenizer.h"
namespace tool
{
// Scanner states used by tokenz::token().
enum parser_states
{
  IN_WHITE = 0, // between tokens, skipping white space
  IN_TOKEN = 1, // inside an unquoted word
  IN_QUOTE = 2  // inside a quoted literal
};
// Builds a tokenizer over the character range [text, text + text_length).
//
// text        - start of the input; only the pointer is stored, the
//               characters are not copied.
// text_length - number of characters to scan.
// flag        - case conversion applied later by token_value().
//
// Default character sets: white space is blank, tab and CR; breaks are
// comma, semicolon, '=' and newline; quotes are ' and "; the escape
// character is backslash.
//
// Initialisers are listed in member declaration order (the order in which
// they actually run), and the break/quote result fields start at 0 so that
// break_used()/quote_used() are well defined before the first token() call.
tokenz::tokenz ( const char * text, size_t text_length, cvt_flag flag ) :
  _p_state(IN_WHITE),
  _p_flag(flag),
  _p_curquote(0),
  _text(text),
  _text_end(text + text_length),
  _pos(text),
  _whites(" \t\r"),   // blank, tab and carriage return
  _breaks(",;=\n"),   // comma, semicolon, equals sign and newline
  _quotes("'\""),     // single and double quote
  _eschar('\\'),      // backslash is the escape character
  _break_used(0),
  _quote_used(0)
{
}
// Finds the first occurrence of ch in the NUL-terminated string str.
// Returns its zero-based index, or -1 when ch does not occur. The
// terminating NUL itself is never matched, so ch == 0 always yields -1.
int tokenz::sindex ( char ch, const char * str )
{
  const char * scan = str;
  // advance until either the terminator or the wanted character is reached
  while ( *scan != '\0' && *scan != ch )
    ++scan;
  return ( *scan != '\0' ) ? static_cast<int>( scan - str ) : -1;
}
// Returns the text of the last token.
// While the scanner state is IN_QUOTE the text is returned untouched;
// otherwise it is passed through to_upper()/to_lower() according to the
// conversion flag given at construction, or returned as is for cvt_no.
// NOTE(review): whether to_upper()/to_lower() also modify _token in place
// depends on tool::string, which is not visible here.
string tokenz::token_value ()
{
  const bool quoted = ( _p_state == IN_QUOTE );
  if ( !quoted && _p_flag == cvt_to_upper )
    return _token.to_upper();
  if ( !quoted && _p_flag == cvt_to_lower )
    return _token.to_lower();
  return _token; // quoted text or no conversion requested
}
// Scans forward from the current position and extracts the next token.
//
// Returns one of token_types:
//   BREAK_CHAR   - a break character was met outside a word/literal;
//                  break_used() yields it and the token text is empty.
//   WORD_VALUE   - an unquoted word, or a quoted run that reached the end of
//                  the text without its closing quote.
//   STRING_VALUE - a quoted literal closed by the same quote that opened it;
//                  quote_used() yields the quote character.
//   END_OF_TEXT  - only white space (or nothing) was left.
// The token text is available through token_value().
//
// A break character that ends a word is not consumed; it is reported as a
// BREAK_CHAR by the following call. White space that ends a word is consumed.
// NOTE(review): a quote character that directly follows a word ends the word,
// is stored in _break_used and IS consumed, so it does not open a literal on
// the next call - confirm this is intended.
int tokenz::token()
{
  if ( _pos >= _text_end )
    return END_OF_TEXT;

  // reset the per-token results and the scanner state
  _break_used = 0;
  _quote_used = 0;
  _token.clear();
  _p_state = IN_WHITE;
  _p_curquote = 0;

  for ( ; _pos < _text_end; ++_pos )
  {
    const char c = *_pos;

    if ( sindex ( c, _breaks ) >= 0 ) // break character
    {
      switch ( _p_state )
      {
      case IN_WHITE: // a break on its own is a token
        ++_pos;
        _break_used = c;
        return BREAK_CHAR;
      case IN_TOKEN: // ends the word; the break is left for the next call
        return WORD_VALUE;
      case IN_QUOTE: // ordinary character inside a literal
        _token += c;
        break;
      }
    }
    else if ( sindex ( c, _quotes ) >= 0 ) // quote character
    {
      switch ( _p_state )
      {
      case IN_WHITE: // opens a literal
        _p_state = IN_QUOTE;
        _p_curquote = c;
        _quote_used = c;
        break;
      case IN_QUOTE:
        if ( c == _p_curquote ) // same quote that opened the literal: close it
        {
          // _p_state deliberately stays IN_QUOTE: token_value() tests it to
          // leave quoted text unconverted, and the next token() call resets
          // the state anyway.
          _p_curquote = 0;
          ++_pos;
          return STRING_VALUE;
        }
        _token += c; // the other kind of quote is just text
        break;
      case IN_TOKEN: // the quote acts as a break character and is consumed
        _break_used = c;
        ++_pos;
        return WORD_VALUE;
      }
    }
    else if ( sindex ( c, _whites ) >= 0 ) // white space
    {
      switch ( _p_state )
      {
      case IN_WHITE: // keep skipping
        break;
      case IN_TOKEN: // ends the word
        ++_pos;
        return WORD_VALUE;
      case IN_QUOTE: // white space is kept inside a literal
        _token += c;
        break;
      }
    }
    else if ( c == _eschar && ( _pos < ( _text_end - 1 ) ) ) // escape with a following char
    {
      // The character after the escape is taken literally. An escape met
      // between tokens starts a word. This is done directly instead of
      // stepping _pos back and re-scanning, which would form a pointer
      // before the start of the buffer when the escape is the first char.
      if ( _p_state == IN_WHITE )
        _p_state = IN_TOKEN;
      ++_pos;
      _token += *_pos;
    }
    else // any other character (including a trailing lone escape)
    {
      if ( _p_state == IN_WHITE )
        _p_state = IN_TOKEN; // first character of a word
      _token += c;
    }
  }

  // End of input: whatever was collected (word or unterminated literal) is a word.
  if ( _p_state == IN_TOKEN || _p_state == IN_QUOTE )
    return WORD_VALUE;
  return END_OF_TEXT;
}
}