ALib C++ Library
Library Version: 2511 R0
Documentation generated by doxygen
Loading...
Searching...
No Matches
tokenizer.inl
Go to the documentation of this file.
1//==================================================================================================
2/// \file
3/// This header-file is part of module \alib_strings of the \aliblong.
4///
5/// \emoji :copyright: 2013-2025 A-Worx GmbH, Germany.
6/// Published under \ref mainpage_license "Boost Software License".
7//==================================================================================================
8ALIB_EXPORT namespace alib { namespace strings {
9
10/// This sub-namespace provides some utility classes which are related
11/// to string classes found in namespace \ref alib::strings.
12namespace util {
13
14//==================================================================================================
15/// This class operates on strings which contain data separated by a delimiter character.
16/// It identifies the substrings between the delimiters as \e tokens of type
17/// \alib{strings;TSubstring;Substring}. After an instance of this class is constructed,
18/// three methods are available:
19/// - #HasNext: Indicates if there are further tokens available.
20/// - #Next: Sets field #Actual (which is of type \b Substring) to reference the next token and
21/// returns it.<br>
22/// With each call to %Next, a different delimiter can be provided, which then serves as the
23/// delimiter for this and subsequent tokens.<br>
24/// By default, the returned token is trimmed using the characters set in field #TrimChars.
25/// - #GetRest:
26/// Like #Next, however returns the complete remaining region without searching for
27/// further delimiters (and tokens).<br>
28/// After this method was invoked, #HasNext() will return \c false.
29///
30/// After a token was retrieved, it might be modified using the interface of class
31/// \alib{strings;TSubstring;Substring} as the tokenizer does not rely on the bounds of
32/// the current token when receiving the next.
33///
34/// Objects of this class can be reused by freshly initializing them by using method #Set.
35/// Furthermore, even the field #Rest is allowed to be changed using the interface of
36/// \b %Substring if it seems appropriate. The effect is the same as if the method #Set was
37/// invoked to apply a different source string.
38///
39/// <b>Sample code</b>:<br>
40/// The following code sample shows how to tokenize a string:
41///
42/// \snippet "DOX_TOKENIZER.cpp" DOX_TOKENIZER
43///
44/// The output will be:
45///
46/// \verbinclude "DOX_TOKENIZER.txt"
47///
48/// @tparam TChar The character type. Implementations for \c nchar and \c wchar are provided
49/// with type definitions \ref alib::TokenizerN and
50/// \ref alib::TokenizerW.
51//==================================================================================================
52template<typename TChar>
54{
55 //################################################################################################
56 // Public fields
57 //################################################################################################
58 public:
59 /// A \alib{strings;TSubstring;Substring} that represents the part of
60 /// the underlying data that has not been tokenized, yet.
61 /// It is allowed to manipulate this public field, which has a similar effect as
62 /// using method #Set.<br>
64
65 /// The actual token, which is returned with every invocation of #Next() or #GetRest().
66 /// It is allowed to manipulate this field any time.<br>
68
69 /// The whitespace characters used to trim the tokens.
70 /// Defaults to \ref alib::DEFAULT_WHITESPACES
72
73
74 //################################################################################################
75 // Internal fields
76 //################################################################################################
77 protected:
78 /// The most recently set delimiter used by default for the next token extraction.
79 TChar delim;
80
81 /// If \c true, empty tokens are omitted.
83
84
85 //################################################################################################
86 // Constructors/Destructor
87 //################################################################################################
88 public:
89 /// Constructs an empty tokenizer. To initialize, method #Set needs to be invoked.
91
92 /// Constructs a tokenizer to work on a given string.
93 ///
94 /// @param src The string to be tokenized.
95 /// @param delimiter The delimiter that separates the tokens. Can be changed with
96 /// every next token.
97 /// @param skipEmptyTokens If \c true, empty tokens are omitted.
98 /// Optional and defaults to \c false.
99 TTokenizer( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false )
100 : Rest (src)
101 , Actual(nullptr)
102 , TrimChars( CStringConstantsTraits<TChar>::DefaultWhitespaces() )
103 , delim(delimiter)
104 , skipEmpty(skipEmptyTokens) {}
105
106 //################################################################################################
107 // Interface
108 //################################################################################################
109 public:
110 /// Resets a tokenizer to work on a given string.
111 ///
112 /// @param src The string to be tokenized
113 /// @param delimiter The delimiter that separates the tokens. Can be changed with
114 /// every next token.
115 /// @param skipEmptyTokens If \c true, empty tokens are omitted.
116 /// Optional and defaults to \c false.
117 void Set( const TString<TChar>& src, TChar delimiter, bool skipEmptyTokens= false ) {
118 Actual = nullptr;
119 Rest = src;
120 this->delim = delimiter;
121 this->skipEmpty = skipEmptyTokens;
122 }
123
124 /// Returns the next token, which is afterwards also available through field #Actual.
125 /// If no further token was available, the returned
126 /// \alib{strings;TSubstring;Substring} will be \e nulled.
127 /// (see \alib{strings;TString::IsNull;String::IsNull}).
128 /// To prevent this, the availability of a next token should be
129 /// checked using method #HasNext().
130 ///
131 /// For clarification, see the explanation and sample code in this class's documentation.
132 ///
133 /// @param trimming Determines if the token is trimmed with respect to the whitespace
134 /// characters defined in field #TrimChars.
135 /// Defaults to \b Whitespaces.Trim.
136 /// @param newDelim The delimiter that separates the tokens. Defaults to 0, which keeps the
137 /// current delimiter intact.
138 /// A new delimiter can be provided for every next token.
139 /// @return The next token as \b %Substring. A nulled string is returned if no next token was
140 /// available.
143 TChar newDelim= '\0' );
144
145 /// Returns the currently remaining string (without searching for further delimiter
146 /// characters).
147 /// After this call #HasNext will return \c false and #Next will return a \e nulled
148 /// Substring.
149 /// @param trimming Determines if the token is trimmed with respect to the whitespace
150 /// characters defined in field #TrimChars.
151 /// Defaults to \b Whitespaces.Trim.
152 /// @return The rest of the original source string, which was not returned by #Next(), yet.
154 // The remaining string becomes the actual token; nulling Rest marks the end of tokenization.
155 Actual= Rest;
156 Rest = nullptr;
157 if ( trimming == lang::Whitespaces::Trim )
158 Actual.Trim( TrimChars );
159 return Actual;
160 }
161
162 /// If this returns \c true, a call to #Next will be successful and will return a
163 /// \b Substring which is not \e nulled.
164 /// @return \c true if a next token is available.
165 bool HasNext() { return Rest.IsNotNull() && ( !skipEmpty || Rest.IsNotEmpty() ); }
166
167}; // class TTokenizer
168
169
172
173}} // namespace alib[::strings::util]
174
175/// Type alias in namespace \b alib.
177
178/// Type alias in namespace \b alib.
180
181/// Type alias in namespace \b alib.
183
184
185
186} // namespace [alib]
ALIB_DLL TSubstring< TChar > & Next(lang::Whitespaces trimming=lang::Whitespaces::Trim, TChar newDelim='\0')
Definition tokenizer.cpp:26
void Set(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
TTokenizer(const TString< TChar > &src, TChar delimiter, bool skipEmptyTokens=false)
Definition tokenizer.inl:99
TSubstring< TChar > & GetRest(lang::Whitespaces trimming=lang::Whitespaces::Trim)
TTokenizer()
Constructs an empty tokenizer. To initialize, method Set needs to be invoked.
Definition tokenizer.inl:90
TLocalString< character, 8 > TrimChars
Definition tokenizer.inl:71
#define ALIB_DLL
Definition alib.inl:503
#define ALIB_EXPORT
Definition alib.inl:497
Whitespaces
Denotes whether a string is trimmed or not.
@ Trim
Trim whitespaces away.
strings::util::TTokenizer< nchar > TokenizerN
Type alias in namespace alib.
strings::util::TTokenizer< character > Tokenizer
Type alias in namespace alib.
characters::wchar wchar
Type alias in namespace alib.
characters::nchar nchar
Type alias in namespace alib.
strings::util::TTokenizer< wchar > TokenizerW
Type alias in namespace alib.