ALib C++ Library
Library Version: 2510 R0
Documentation generated by doxygen
Loading...
Searching...
No Matches
tokenizer.inl
Go to the documentation of this file.
1//==================================================================================================
2/// \file
3/// This header-file is part of module \alib_strings of the \aliblong.
4///
5/// \emoji :copyright: 2013-2025 A-Worx GmbH, Germany.
6/// Published under \ref mainpage_license "Boost Software License".
7//==================================================================================================
8ALIB_EXPORT namespace alib { namespace strings {
9
10/// This sub-namespace provides some utility classes which are related
11/// to string classes found in namespace \ref alib::strings.
12namespace util {
13
14//==================================================================================================
15/// This class operates on strings that contain data separated by a delimiter character.
16/// It identifies the substrings between the delimiters as \e tokens of type
17/// \alib{strings;TSubstring;Substring}. After an instance of this class is constructed,
18/// three methods are available:
19/// - #HasNext: Indicates if there are further tokens available.
20/// - #Next: Sets field #Actual (which is of type \b Substring) to reference the next token and
21/// returns it.<br>
22/// With each call to %Next, a different delimiter can be provided, which then serves as the
23/// delimiter for this and subsequent tokens.<br>
24/// By default, the returned token is trimmed using the characters in field #TrimChars.
25/// - #Rest:
26/// Like #Next, however returns the complete remaining region without searching for
27/// further delimiters (and tokens).<br>
28/// After this method was invoked, #HasNext() will return \c false.
29///
30/// After a token was retrieved, it might be modified using the interface of class
31/// \alib{strings;TSubstring;Substring} as the tokenizer does not rely on the bounds of
32/// the current token when receiving the next. Furthermore, even field #Rest is allowed
33/// to be changed using the interface of \b %Substring if it seems appropriate. The effect is the
34/// same as if method #Set was invoked to apply a different source string.
35///
36/// Objects of this class can be reused by freshly initializing them using method #Set.
37///
38/// <b>Sample code</b>:<br>
39/// The following code sample shows how to tokenize a string:
40///
41/// \snippet "DOX_TOKENIZER.cpp" DOX_TOKENIZER
42///
43/// The output will be:
44///
45/// \verbinclude "DOX_TOKENIZER.txt"
46///
47/// @tparam TChar The character type. Implementations for \c nchar and \c wchar are provided
48/// with type definitions \ref alib::TokenizerN and
49/// \ref alib::TokenizerW.
50//==================================================================================================
51template<typename TChar>
53{
54 // #############################################################################################
55 // Public fields
56 // #############################################################################################
57 public:
58 /// A \alib{strings;TSubstring;Substring} that represents the part of
59 /// the underlying data that has not been tokenized, yet.
60 /// It is allowed to manipulate this public field, which has a similar effect as
61 /// using method #Set.<br>
63
64 /// The actual token, which is returned with every invocation of #Next() or #Rest().
65 /// It is allowed to manipulate this field any time.<br>
67
68 /// The whitespace characters used to trim the tokens.
69 /// Defaults to \ref alib::DEFAULT_WHITESPACES
71
72
73 // #############################################################################################
74 // Internal fields
75 // #############################################################################################
76 protected:
77 /// The most recently set delimiter used by default for the next token extraction.
78 TChar delim;
79
80 /// If \c true, empty tokens are omitted.
82
83
84 // #############################################################################################
85 // Constructors/Destructor
86 // #############################################################################################
87 public:
88 //==========================================================================================
89 /// Constructs an empty tokenizer. To initialize, method #Set needs to be invoked.
90 //==========================================================================================
92 {}
93
94 //==========================================================================================
95 /// Constructs a tokenizer to work on a given string.
96 ///
97 /// @param src The string to be tokenized.
98 /// @param delimiter The delimiter that separates the tokens. Can be changed with
99 /// every next token.
100 /// @param skipEmptyTokens If \c true, empty tokens are omitted.
101 /// Optional and defaults to \c false.
102 //==========================================================================================
103 TTokenizer( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false )
104 : Rest (src) // the complete source string is still untokenized
105 , Actual(nullptr) // nulled until a token is retrieved
106 , TrimChars( CStringConstantsTraits<TChar>::DefaultWhitespaces() ) // default set of characters trimmed from tokens
107 , delim(delimiter) // remembered as the current delimiter
108 , skipEmpty(skipEmptyTokens) // if true, empty tokens are omitted
109 {}
110
111 // #############################################################################################
112 // Interface
113 // #############################################################################################
114 public:
115 //==========================================================================================
116 /// Resets a tokenizer to work on a given string.
117 ///
118 /// @param src The string to be tokenized
119 /// @param delimiter The delimiter that separates the tokens. Can be changed with
120 /// every next token.
121 /// @param skipEmptyTokens If \c true, empty tokens are omitted.
122 /// Optional and defaults to \c false.
123 //==========================================================================================
124 void Set( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false )
125 {
126 Actual = nullptr; // discard any token returned for the previous source
127 Rest = src; // the complete new source string is pending again
128 this->delim = delimiter; // replaces the previously stored delimiter
129 this->skipEmpty = skipEmptyTokens; // note: field TrimChars is not modified by this method
130 }
131
132 //==========================================================================================
133 /// Returns the next token, which is afterwards also available through field #Actual.
134 /// If no further token was available, the returned
135 /// \alib{strings;TSubstring;Substring} will be \e nulled.
136 /// (see \alib{strings;TString::IsNull;String::IsNull}).
137 /// To prevent this, the availability of a next token should be
138 /// checked using method #HasNext().
139 ///
140 /// For clarification, see the explanation and sample code in this class's documentation.
141 ///
142 /// @param trimming Determines if the token is trimmed in respect to the white space
143 /// characters defined in field #TrimChars.
144 /// Defaults to \b Whitespaces.Trim.
145 /// @param newDelim The delimiter that separates the tokens. Defaults to 0, which keeps the
146 /// current delimiter intact.
147 /// A new delimiter can be provided for every next token.
148 /// @return The next token as \b %Substring. A nulled string is returned if no next token was
149 /// available.
150 //==========================================================================================
153 TChar newDelim= '\0' );
154
155 //==========================================================================================
156 /// Returns the currently remaining string (without searching for further delimiter
157 /// characters).
158 /// After this call #HasNext will return \c false and #Next will return a \e nulled
159 /// Substring.
160 /// @param trimming Determines if the token is trimmed in respect to the white space
161 /// characters defined in field #TrimChars.
162 /// Defaults to \b Whitespaces.Trim.
163 /// @return The rest of the original source string, which was not returned by #Next(), yet.
164 //==========================================================================================
166 {
167 // the remainder becomes the actual token; nulling Rest marks the end of tokenization
168 Actual= Rest;
169 Rest = nullptr;
170 if ( trimming == lang::Whitespaces::Trim )
171 Actual.Trim( TrimChars );
172 return Actual;
173 }
174
175 //==========================================================================================
176 /// If this returns \c true, a call to #Next will be successful and will return a
177 /// \b Substring which is not \e nulled.
178 /// @return \c true if a next token is available.
179 //==========================================================================================
180 bool HasNext()
181 { // a nulled Rest means nothing is left; with skipEmpty set, an empty Rest counts as exhausted, too
182 return Rest.IsNotNull() && ( !skipEmpty || Rest.IsNotEmpty() ); // without skipEmpty, an empty but non-nulled Rest still reports true
183 }
184
185}; // class TTokenizer
186
187
190
191}} // namespace alib[::strings::util]
192
193/// Type alias in namespace \b alib.
195
196/// Type alias in namespace \b alib.
198
199/// Type alias in namespace \b alib.
201
202
203
204} // namespace [alib]
205
206
ALIB_DLL TSubstring< TChar > & Next(lang::Whitespaces trimming=lang::Whitespaces::Trim, TChar newDelim='\0')
Definition tokenizer.cpp:26
void Set(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
TTokenizer(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
TSubstring< TChar > & GetRest(lang::Whitespaces trimming=lang::Whitespaces::Trim)
TTokenizer()
Constructs an empty tokenizer. To initialize, method Set needs to be invoked.
Definition tokenizer.inl:91
TLocalString< character, 8 > TrimChars
Definition tokenizer.inl:70
#define ALIB_DLL
Definition alib.inl:496
#define ALIB_EXPORT
Definition alib.inl:488
Whitespaces
Denotes whether a string is trimmed or not.
@ Trim
Trim whitespaces away.
strings::util::TTokenizer< nchar > TokenizerN
Type alias in namespace alib.
strings::util::TTokenizer< character > Tokenizer
Type alias in namespace alib.
characters::wchar wchar
Type alias in namespace alib.
characters::nchar nchar
Type alias in namespace alib.
strings::util::TTokenizer< wchar > TokenizerW
Type alias in namespace alib.