ionflux.org | Impressum

Utf8Tokenizer.hpp

Go to the documentation of this file.
00001 #ifndef IONFLUX_TOOLS_UTF8TOKENIZER
00002 #define IONFLUX_TOOLS_UTF8TOKENIZER
00003 /* ==========================================================================
00004  * Ionflux Tools
00005  * Copyright (c) 2005 Joern P. Meier
00006  * mail@ionflux.org
00007  * --------------------------------------------------------------------------
00008  * Utf8Tokenizer.hpp                 Tokenizer with UTF-8 support.
00009  * ==========================================================================
00010  * 
00011  * This file is part of Ionflux Tools.
00012  * 
00013  * Ionflux Tools is free software; you can redistribute it and/or modify it
00014  * under the terms of the GNU General Public License as published by the 
00015  * Free Software Foundation; either version 2 of the License, or (at your 
00016  * option) any later version.
00017  * 
00018  * Ionflux Tools is distributed in the hope that it will be useful, but 
00019  * WITHOUT ANY WARRANTY; without even the implied warranty of 
00020  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
00021  * General Public License for more details.
00022  * 
00023  * You should have received a copy of the GNU General Public License along 
00024  * with Ionflux Tools; if not, write to the Free Software Foundation, Inc.,
00025  * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
00026  * 
00027  * ========================================================================== */
00028 
00029 #include "ionflux/Utf8TokenTypeMap.hpp"
00030 #include "ionflux/ManagedObject.hpp"
00031 
00032 namespace Ionflux
00033 {
00034 
00035 namespace Tools
00036 {
00037 
/**
 * A single token: an integer type ID paired with the token's text.
 *
 * Utf8Tokenizer returns objects of this type by value from getNextToken()
 * and getCurrentToken(), and declares two constants of this type
 * (TOK_INVALID and TOK_NONE).
 *
 * NOTE(review): typeID presumably holds the ID of the Utf8TokenType
 * (declared in ionflux/Utf8TokenTypeMap.hpp) that produced the token --
 * confirm against that header.
 */
00041 struct Utf8Token
00042 {
          /// Numeric identifier of the token's type.
00044     int typeID;
          /// Text of the token (presumably UTF-8 encoded -- TODO confirm).
00046     std::string value;
00047 };
00048 
/**
 * Class information type for Utf8Tokenizer.
 *
 * Derives from Ionflux::Tools::ClassInfo. Utf8Tokenizer declares one static
 * instance of this type (utf8TokenizerClassInfo) next to its CLASS_INFO
 * pointer.
 *
 * NOTE(review): the constructor presumably fills in the class name and
 * description inherited from ClassInfo -- its definition is not part of this
 * header, confirm in the implementation file.
 */
00050 class Utf8TokenizerClassInfo
00051 : public Ionflux::Tools::ClassInfo
00052 {
00053     public:
              /// Default constructor (declared here, defined out of line).
00055         Utf8TokenizerClassInfo();
              /// Virtual destructor; the body is empty.
00057         virtual ~Utf8TokenizerClassInfo() { };
00058 };
00059 
/**
 * Tokenizer with UTF-8 support.
 *
 * Derives from Ionflux::Tools::ManagedObject. The class keeps an input
 * sequence of unsigned int values, a pointer to a Utf8TokenTypeMap, the
 * current position and the most recently produced Utf8Token, and hands out
 * tokens one at a time through getNextToken().
 *
 * Only declarations are visible in this header. Behavioural notes marked
 * "presumably" or "NOTE(review)" below are inferred from names and
 * signatures and must be confirmed against the implementation file.
 */
00079 class Utf8Tokenizer
00080 : public Ionflux::Tools::ManagedObject
00081 {
00082     private:
00083         
00084     protected:
              /// Input to be tokenized, one unsigned int per element
              /// (presumably one decoded Unicode code point each -- TODO confirm).
00086         std::vector<unsigned int> theInput;
              /// Characters treated as quote characters (presumably derived
              /// from QUOTE_CHARS -- TODO confirm).
00088         std::vector<unsigned int> quoteChars;
              /// Current position (presumably an index into theInput).
00090         unsigned int currentPos;
              /// Position of the current token (presumably the index in
              /// theInput where currentToken starts -- TODO confirm).
00092         unsigned int currentTokenPos;
              /// Quote character currently in effect; exposed by getQuoteChar().
00094         unsigned int currentQuoteChar;
              /// Token type map used for tokenizing.
              /// NOTE(review): ownership of the pointee is not visible here.
00096         Utf8TokenTypeMap* typeMap;
              /// Most recently produced token; exposed by getCurrentToken().
00098         Utf8Token currentToken;
              /// Flag behind setExtractQuoted()/getExtractQuoted().
00100         bool extractQuoted;
              /// Flag behind setExtractEscaped()/getExtractEscaped().
00102         bool extractEscaped;
00103         
00104     public:
              // The static constants below are only declared here; their
              // values are defined out of line and are not visible in this
              // header.

              /// Token type: invalid token.
00106         static const Utf8TokenType TT_INVALID;
              /// Token type: no token.
00108         static const Utf8TokenType TT_NONE;
              /// Token type: default.
00110         static const Utf8TokenType TT_DEFAULT;
              /// Token type: quoted text.
00112         static const Utf8TokenType TT_QUOTED;
              /// Token type: escaped text.
00114         static const Utf8TokenType TT_ESCAPED;
              /// Token type: linear whitespace.
00116         static const Utf8TokenType TT_LINEAR_WHITESPACE;
              /// Token type: line terminator.
00118         static const Utf8TokenType TT_LINETERM;
              /// Token type: identifier.
00120         static const Utf8TokenType TT_IDENTIFIER;
              /// Token type: number.
00122         static const Utf8TokenType TT_NUMBER;
              /// Token type: alphabetic characters.
00124         static const Utf8TokenType TT_ALPHA;
              /// Token type: default separator.
00126         static const Utf8TokenType TT_DEFAULT_SEP;
              /// Token type: Latin characters.
00128         static const Utf8TokenType TT_LATIN;
              /// Token constant: invalid token.
00130         static const Utf8Token TOK_INVALID;
              /// Token constant: no token.
00132         static const Utf8Token TOK_NONE;
              /// Quote characters, as a string.
00134         static const std::string QUOTE_CHARS;
              /// Escape character.
00136         static const unsigned int ESCAPE_CHAR;
              /// Class information instance.
00138         static const Utf8TokenizerClassInfo utf8TokenizerClassInfo;
              /// Class information pointer (presumably points at
              /// utf8TokenizerClassInfo -- TODO confirm).
00140         static const Ionflux::Tools::ClassInfo* CLASS_INFO;
00141         
              /// Default constructor.
00146         Utf8Tokenizer();
00147         
              /**
               * Construct a tokenizer for the given input.
               *
               * NOTE(review): this single-argument constructor is not
               * explicit, so a std::string converts implicitly to a
               * Utf8Tokenizer.
               *
               * \param initInput Initial input string.
               */
00154         Utf8Tokenizer(const std::string& initInput);
00155         
              /**
               * Construct a tokenizer with a set of token types and,
               * optionally, an input string.
               *
               * \param initTokenTypes Initial token types.
               * \param initInput Initial input string; defaults to the
               * empty string.
               */
00163         Utf8Tokenizer(const std::vector<Utf8TokenType>& initTokenTypes, const 
00164         std::string& initInput = "");
00165         
              /// Virtual destructor.
00170         virtual ~Utf8Tokenizer();
00171         
              /// Reset the tokenizer (presumably rewinds to the start of the
              /// input -- TODO confirm which members are reset).
00176         virtual void reset();
00177         
              /// Clear the token types (presumably empties the type map).
00182         virtual void clearTokenTypes();
00183         
              /// Use the default token types (which types those are is not
              /// visible in this header).
00189         virtual void useDefaultTokenTypes();
00190         
              /// Add the default token type (presumably TT_DEFAULT -- TODO
              /// confirm).
00196         virtual void addDefaultTokenType();
00197         
              /**
               * Set the token types.
               *
               * NOTE(review): presumably replaces all existing token types,
               * in contrast to addTokenTypes() -- confirm.
               *
               * \param newTokenTypes Token types to use.
               */
00204         virtual void setTokenTypes(const std::vector<Utf8TokenType>& 
00205         newTokenTypes);
00206         
              /**
               * Add several token types.
               *
               * \param newTokenTypes Token types to add.
               */
00213         virtual void addTokenTypes(const std::vector<Utf8TokenType>& 
00214         newTokenTypes);
00215         
              /**
               * Add a single token type.
               *
               * \param newTokenType Token type to add.
               */
00222         virtual void addTokenType(const Utf8TokenType& newTokenType);
00223         
              /**
               * Set the input from a string.
               *
               * NOTE(review): presumably decodes the string as UTF-8 into
               * theInput -- confirm, including how malformed input is
               * handled.
               *
               * \param newInput New input string.
               */
00230         virtual void setInput(const std::string& newInput);
00231         
              /**
               * Set the input from a vector of unsigned int values, the same
               * element type as theInput.
               *
               * \param newInput New input sequence.
               */
00238         virtual void setInput(const std::vector<unsigned int>& newInput);
00239         
              /**
               * Get the next token from the input.
               *
               * NOTE(review): a non-null otherTypeMap is presumably used
               * instead of the tokenizer's own type map for this call;
               * what is returned at end of input (likely TOK_NONE) is not
               * visible here -- confirm both.
               *
               * \param otherTypeMap Optional alternative token type map;
               * defaults to a null pointer.
               *
               * \return The next token, by value.
               */
00250         virtual Utf8Token getNextToken(Utf8TokenTypeMap* otherTypeMap = 0);
00251         
              /// Get the current token, by value.
00258         virtual Utf8Token getCurrentToken();
00259         
              /// Get the type of the current token (presumably
              /// currentToken.typeID -- TODO confirm).
00266         virtual int getCurrentTokenType();
00267         
              /// Get the current position.
00274         virtual unsigned int getCurrentPos();
00275         
              /// Get the position of the current token.
00282         virtual unsigned int getCurrentTokenPos();
00283         
              /// Get the quote character (presumably currentQuoteChar -- TODO
              /// confirm).
00290         virtual unsigned int getQuoteChar();
00291         
              /**
               * Check whether a token is valid.
               *
               * Static, so it can be called without a tokenizer instance.
               *
               * NOTE(review): the validity criterion (presumably a
               * comparison against TOK_INVALID) is not visible here --
               * confirm.
               *
               * \param checkToken Token to check.
               *
               * \return Whether the token is considered valid.
               */
00301         static bool isValid(const Utf8Token& checkToken);
00302         
              /**
               * Set the quoted-string extraction flag.
               *
               * \param newExtractQuoted New value of the flag.
               */
00309         virtual void setExtractQuoted(bool newExtractQuoted);
00310         
              /// Get the quoted-string extraction flag.
00315         virtual bool getExtractQuoted() const;
00316         
              /**
               * Set the escaped-character extraction flag.
               *
               * \param newExtractEscaped New value of the flag.
               */
00323         virtual void setExtractEscaped(bool newExtractEscaped);
00324         
              /// Get the escaped-character extraction flag.
00329         virtual bool getExtractEscaped() const;
00330 };
00331 
00332 }
00333 
00334 }
00335 
00339 #endif

Generated on Tue Mar 14 20:58:30 2006 for Ionflux Tools Class Library (iftools) by  doxygen 1.4.6