ALib C++ Library
Library Version: 2412 R0
Documentation generated by doxygen
Loading...
Searching...
No Matches
tokenizer.hpp
Go to the documentation of this file.
1//==================================================================================================
2/// \file
3/// This header file is part of module \alib_strings of the \aliblong.
4///
5/// \emoji :copyright: 2013-2024 A-Worx GmbH, Germany.
6/// Published under \ref mainpage_license "Boost Software License".
7//==================================================================================================
8#ifndef HPP_ALIB_STRINGS_UTIL_TOKENIZER
9#define HPP_ALIB_STRINGS_UTIL_TOKENIZER 1
10#pragma once
13
14namespace alib { namespace strings {
15
16/// This sub-namespace provides some utility classes which are related
17/// to string classes found in namespace \ref alib::strings.
18namespace util {
19
20
21//==================================================================================================
22/// This class operates on strings which contain data separated by a delimiter character.
23/// It identifies the substrings between the delimiters as \e tokens of type
24/// \alib{strings;TSubstring;Substring}. After an instance of this class is constructed,
25/// three methods are available:
26/// - #HasNext: Indicates if there are further tokens available.
27/// - #Next: Sets field #Actual (which is of type \b Substring) to reference the next token and
28/// returns it.<br>
29/// With each call to %Next, a different delimiter can be provided, which then serves as the
30/// delimiter for this and subsequent tokens.<br>
31/// The returned token by default will be trimmed according to the current trimmable characters.
32/// - #GetRest:
33/// Like #Next, however returns the complete remaining region without searching for
34/// further delimiters (and tokens).<br>
35/// After this method was invoked, #HasNext() will return \c false.
36///
37/// After a token was retrieved, it might be modified using the interface of class
38/// \alib{strings;TSubstring;Substring} as the tokenizer does not rely on the bounds of
39/// the current token when receiving the next. Furthermore, even field #Rest is allowed
40/// to be changed using the interface of \b %Substring if it seems appropriate. The effect is the
41/// same as if method #Set was invoked to apply a different source string.
42///
43/// Objects of this class can be reused by freshly initializing them using method #Set.
44///
45/// <b>Sample code</b>:<br>
46/// The following code sample shows how to tokenize a string:
47///
48/// \snippet "DOX_TOKENIZER.cpp" DOX_TOKENIZER
49///
50/// The output will be:
51///
52/// \verbinclude "DOX_TOKENIZER.txt"
53///
54/// @tparam TChar The character type. Implementations for \c nchar and \c wchar are provided
55/// with type definitions \ref alib::TokenizerN and
56/// \ref alib::TokenizerW.
57//==================================================================================================
58template<typename TChar>
60{
61 // #############################################################################################
62 // Public fields
63 // #############################################################################################
64 public:
65 /// A \alib{strings;TSubstring;Substring} that represents the part of
66 /// the underlying data that has not been tokenized, yet.
67 /// It is allowed to manipulate this public field, which has a similar effect as
68 /// using method #Set.<br>
70
71 /// The actual token, which is returned with every invocation of #Next() or #GetRest().
72 /// It is allowed to manipulate this field any time.<br>
74
75 /// The whitespace characters used to trim the tokens.
76 /// Defaults to \ref alib::DEFAULT_WHITESPACES
78
79
80 // #############################################################################################
81 // Internal fields
82 // #############################################################################################
83 protected:
84 /// The most recently set delimiter used by default for the next token extraction.
85 TChar delim;
86
87 /// If \c true, empty tokens are omitted.
89
90
91 // #############################################################################################
92 // Constructors/Destructor
93 // #############################################################################################
94 public:
95 //==========================================================================================
96 /// Constructs an empty tokenizer. To initialize, method #Set needs to be invoked.
97 //==========================================================================================
99 {}
100
101 //==========================================================================================
102 /// Constructs a tokenizer to work on a given string.
103 ///
104 /// @param src The string to be tokenized.
105 /// @param delimiter The delimiter that separates the tokens. Can be changed with
106 /// every next token.
107 /// @param skipEmptyTokens If \c true, empty tokens are omitted.
108 /// Optional and defaults to \c false.
109 //==========================================================================================
110 TTokenizer( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false )
111 : Rest (src) // nothing consumed yet: the whole source is still to be tokenized
112 , Actual(nullptr) // no token has been retrieved yet
113 , TrimChars( TT_CStringConstants<TChar>::DefaultWhitespaces() ) // default trim set
114 , delim(delimiter) // used by subsequent token extraction
115 , skipEmpty(skipEmptyTokens) // consulted by HasNext()
116 {}
117
118 // #############################################################################################
119 // Interface
120 // #############################################################################################
121 public:
122 //==========================================================================================
123 /// Resets a tokenizer to work on a given string.
124 ///
125 /// @param src The string to be tokenized
126 /// @param delimiter The delimiter that separates the tokens. Can be changed with
127 /// every next token.
128 /// @param skipEmptyTokens If \c true, empty tokens are omitted.
129 /// Optional and defaults to \c false.
130 //==========================================================================================
131 void Set( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false )
132 {
133 Rest = src; // copy first: src may refer to this tokenizer's own Actual (e.g. Set( Actual, ... ))
134 Actual = nullptr; // only now drop the previous token; none retrieved from the new source yet
135 this->delim = delimiter;
136 this->skipEmpty = skipEmptyTokens;
137 }
138
139 //==========================================================================================
140 /// Returns the next token, which is afterwards also available through field #Actual.
141 /// If no further token was available, the returned
142 /// \alib{strings;TSubstring;Substring} will be \e nulled.
143 /// (see \alib{strings;TString::IsNull;String::IsNull}).
144 /// To prevent this, the availability of a next token should be
145 /// checked using method #HasNext().
146 ///
147 /// For clarification, see the explanation and sample code in this class's documentation.
148 ///
149 /// @param trimming Determines if the token is trimmed in respect to the white space
150 /// characters defined in field #TrimChars.
151 /// Defaults to \b Whitespaces.Trim.
152 /// @param newDelim The delimiter that separates the tokens. Defaults to 0, which keeps the
153 /// current delimiter intact.
154 /// A new delimiter can be provided for every next token.
155 /// @return The next token as \b %Substring. A nulled string is returned if no next token was
156 /// available.
157 //==========================================================================================
160 TChar newDelim= '\0' );
161
162 //==========================================================================================
163 /// Returns the currently remaining string (without searching for further delimiter
164 /// characters).
165 /// After this call #HasNext will return \c false and #Next will return a \e nulled
166 /// Substring.
167 /// @param trimming Determines if the token is trimmed in respect to the white space
168 /// characters defined in field #TrimChars.
169 /// Defaults to \b Whitespaces.Trim.
170 /// @return The rest of the original source string, which was not returned by #Next(), yet.
171 //==========================================================================================
173 {
174 // set start, end and end of tokenizer
175 Actual= Rest;
176 Rest = nullptr;
177 if ( trimming == lang::Whitespaces::Trim )
178 Actual.Trim( TrimChars );
179 return Actual;
180 }
181
182 //==========================================================================================
183 /// If this returns \c true, a call to #Next will be successful and will return a
184 /// \b Substring which is not \e nulled.
185 /// @return \c true if a next token is available.
186 //==========================================================================================
187 bool HasNext()
188 {
189 return Rest.IsNotNull() && ( !skipEmpty || Rest.IsNotEmpty() ); // nulled Rest: nothing left; an empty Rest counts only if empty tokens are not skipped
190 }
191
192}; // class Tokenizer
193
194
197
198}} // namespace alib[::strings::util]
199
200/// Type alias in namespace \b alib.
202
203/// Type alias in namespace \b alib.
205
206/// Type alias in namespace \b alib.
208
209
210
211} // namespace [alib]
212
213#endif // HPP_ALIB_STRINGS_UTIL_TOKENIZER
214
TLocalString< TChar, 8 > TrimChars
Definition tokenizer.hpp:77
bool skipEmpty
If true, empty tokens are omitted.
Definition tokenizer.hpp:88
TSubstring< TChar > & GetRest(lang::Whitespaces trimming=lang::Whitespaces::Trim)
ALIB_API TSubstring< TChar > & Next(lang::Whitespaces trimming=lang::Whitespaces::Trim, TChar newDelim='\0')
Definition tokenizer.cpp:16
TTokenizer(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
TTokenizer()
Constructs an empty tokenizer. To initialize, method Set needs to be invoked.
Definition tokenizer.hpp:98
TChar delim
The most recently set delimiter used by default for the next token extraction.
Definition tokenizer.hpp:85
TSubstring< TChar > Actual
Definition tokenizer.hpp:73
void Set(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
#define ALIB_API
Definition alib.hpp:639
Whitespaces
Denotes whether a string is trimmed or not.
@ Trim
Trim whitespaces away.
Definition alib.cpp:69
characters::wchar wchar
Type alias in namespace alib.
characters::nchar nchar
Type alias in namespace alib.