ALib C++ Library
Library Version: 2402 R1
Documentation generated by doxygen
Loading...
Searching...
No Matches
tokenizer.hpp
Go to the documentation of this file.
1/** ************************************************************************************************
2 * \file
3 * This header file is part of module \alib_strings of the \aliblong.
4 *
5 * \emoji :copyright: 2013-2024 A-Worx GmbH, Germany.
6 * Published under \ref mainpage_license "Boost Software License".
7 **************************************************************************************************/
8#ifndef HPP_ALIB_STRINGS_UTIL_TOKENIZER
9#define HPP_ALIB_STRINGS_UTIL_TOKENIZER 1
10
11#if !defined (HPP_ALIB_STRINGS_SUBSTRING)
13#endif
14
15#if !defined (HPP_ALIB_STRINGS_LOCALSTRING)
17#endif
18
19namespace alib { namespace strings {
20
21/**
22 * This sub-namespace provides some utility classes which are related
23 * to string classes found in namespace \ref alib::strings.
24 */
25namespace util {
26
27
28/** ************************************************************************************************
29 * This class operates on strings which contain data separated by a delimiter character.
30 * It identifies the sub-strings between the delimiters as \e tokens of type
31 * \alib{strings;TSubstring;Substring}. After an instance of this class is constructed,
32 * three methods are available:
33 * - #HasNext: Indicates if there are further tokens available.
34 * - #Next: Sets field #Actual (which is of type \b Substring) to reference the next token and
35 * returns it.<br>
36 * With each call to %Next, a different delimiter can be provided, which then serves as the
37 * delimiter for this and subsequent tokens.<br>
38 * The returned token by default will be trimmed according to the current trimmable characters.
39 * - #GetRest:
40 * Like #Next, however returns the complete remaining region without searching for
41 * further delimiters (and tokens).<br>
42 * After this method was invoked, #HasNext() will return \c false.
43 *
44 * After a token was retrieved, it might be modified using the interface of class
45 * \alib{strings;TSubstring;Substring} as the tokenizer does not rely on the bounds of
46 * the current token when receiving the next. Furthermore, even field #Rest is allowed
47 * to be changed using the interface of \b %Substring if it seems appropriate. The effect is the
48 * same as if method #Set was invoked to apply a different source string.
49 *
50 * Objects of this class can be reused by freshly initializing them using method #Set.
51 *
52 * <b>Sample code</b>:<br>
53 * The following code sample shows how to tokenize a string:
54 *
55 * \snippet "DOX_ALIB_TOKENIZER.cpp" DOX_ALIB_TOKENIZER
56 *
57 * The output will be:
58 *
59 * \verbinclude "DOX_ALIB_TOKENIZER.txt"
60 *
61 * @tparam TChar The character type. Implementations for \c nchar and \c wchar are provided
62 * with type definitions \ref alib::TokenizerN and
63 * \ref alib::TokenizerW.
64 **************************************************************************************************/
65template<typename TChar>
67{
68 // #############################################################################################
69 // Public fields
70 // #############################################################################################
71 public:
72 /**
73 * A \alib{strings;TSubstring;Substring} that represents the part of
74 * the underlying data that has not been tokenized, yet.
75 * It is allowed to manipulate this public field, which has a similar effect as
76 * using method #Set.<br>
77 */
79
80 /**
81 * The actual token, which is returned with every invocation of #Next() or #GetRest().
82 * It is allowed to manipulate this field any time.<br>
83 */
85
86 /**
87 * The whitespace characters used to trim the tokens.
88 * Defaults to \ref alib::DefaultWhitespaces
89 */
91
92
93 // #############################################################################################
94 // Internal fields
95 // #############################################################################################
96 protected:
97 /** The most recently set delimiter used by default for the next token extraction. */
98 TChar delim;
99
100 /** If \c true, empty tokens are omitted. */
102
103
104 // #############################################################################################
105 // Constructors/Destructor
106 // #############################################################################################
107 public:
108 /** ****************************************************************************************
109 * Constructs an empty tokenizer. To initialize, method #Set needs to be invoked.
110 ******************************************************************************************/
112 {}
113
114 /** ****************************************************************************************
115 * Constructs a tokenizer to work on a given string.
116 *
117 * @param src The string to be tokenized.
118 * @param delimiter The delimiter that separates the tokens. Can be changed with
119 * every next token.
120 * @param skipEmptyTokens If \c true, empty tokens are omitted.
121 * Optional and defaults to \c false.
122 ******************************************************************************************/
123 TTokenizer( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false )
124 : Rest (src)
125 , Actual(nullptr)
127 , delim(delimiter)
128 , skipEmpty(skipEmptyTokens)
129 {}
130
131 // #############################################################################################
132 // Interface
133 // #############################################################################################
134 public:
135 /** ****************************************************************************************
136 * Resets a tokenizer to work on a given string.
137 *
138 * @param src The string to be tokenized
139 * @param delimiter The delimiter that separates the tokens. Can be changed with
140 * every next token.
141 * @param skipEmptyTokens If \c true, empty tokens are omitted.
142 * Optional and defaults to \c false.
143 ******************************************************************************************/
144 void Set( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false )
145 {
146 Actual = nullptr;
147 Rest = src;
148 this->delim = delimiter;
149 this->skipEmpty = skipEmptyTokens;
150 }
151
152 /** ****************************************************************************************
153 * Returns the next token, which is afterwards also available through field #Actual.
154 * If no further token was available, the returned
155 * \alib{strings;TSubstring;Substring} will be \e nulled.
156 * (see \alib{strings;TString::IsNull;String::IsNull}).
157 * To prevent this, the availability of a next token should be
158 * checked using method #HasNext().
159 *
160 * For clarification, see the explanation and sample code in this class's documentation.
161 *
162 * @param trimming Determines if the token is trimmed with respect to the white space
163 * characters defined in field #TrimChars.
164 * Defaults to \b Whitespaces.Trim.
165 * @param newDelim The delimiter that separates the tokens. Defaults to 0, which keeps the
166 * current delimiter intact.
167 * A new delimiter can be provided for every next token.
168 * @return The next token as a \b Substring, which is \e nulled if no further token was available.
169 ******************************************************************************************/
172 TChar newDelim= '\0' );
173
174 /** ****************************************************************************************
175 * Returns the currently remaining string (without searching for further delimiter
176 * characters).
177 * After this call #HasNext will return \c false and #Next will return a \e nulled
178 * Substring.
179 * @param trimming Determines if the token is trimmed with respect to the white space
180 * characters defined in field #TrimChars.
181 * Defaults to \b Whitespaces.Trim.
182 * @return The rest of the original source string, which was not returned by #Next(), yet.
183 ******************************************************************************************/
185 {
186 // set start, end and end of tokenizer
187 Actual= Rest;
188 Rest = nullptr;
189 if ( trimming == lang::Whitespaces::Trim )
190 Actual.Trim( TrimChars );
191 return Actual;
192 }
193
194 /** ****************************************************************************************
195 * If this returns \c true, a call to #Next will be successful and will return a
196 * \b Substring which is not \e nulled.
197 * @return \c true if a next token is available.
198 ******************************************************************************************/
199 bool HasNext()
200 {
201 return Rest.IsNotNull() && ( !skipEmpty || Rest.IsNotEmpty() );
202 }
203
204}; // class Tokenizer
205
206
209
210}} // namespace alib[::strings::util]
211
212/// Type alias in namespace \b alib.
214
215/// Type alias in namespace \b alib.
217
218/// Type alias in namespace \b alib.
220
221
222
223} // namespace [alib]
224
225#endif // HPP_ALIB_STRINGS_UTIL_TOKENIZER
TLocalString< TChar, 8 > TrimChars
Definition tokenizer.hpp:90
TSubstring< TChar > & GetRest(lang::Whitespaces trimming=lang::Whitespaces::Trim)
ALIB_API TSubstring< TChar > & Next(lang::Whitespaces trimming=lang::Whitespaces::Trim, TChar newDelim='\0')
Definition tokenizer.cpp:18
TTokenizer(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
TSubstring< TChar > Actual
Definition tokenizer.hpp:84
void Set(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
#define ALIB_API
Definition alib.hpp:538
@ Trim
Trim whitespaces away.
Definition alib.cpp:57
constexpr CString DefaultWhitespaces()
Definition cstring.hpp:554
characters::wchar wchar
Type alias in namespace alib.
characters::nchar nchar
Type alias in namespace alib.