Merge branch 'devel'
[aoi.git] / src / utils.hxx
blob0747cbfd9b6966c44ec4ad182f6095b69c9e6266
1 /*
2 Copyright 2013 Karel Matas
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 #ifndef __UTILS_HXX
18 #define __UTILS_HXX
#include <algorithm>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <exception>
#include <fstream>
#include <map>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <FL/fl_utf8.h>
#include <zlib.h>
35 /*! \file utils.hxx
36 * \note http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&item_id=IWS-Appendix
37 * \note http://home.telfort.nl/~t876506/utf8tbl.html
41 using std::vector;
42 using std::string;
44 namespace utils {
/*!
 * Upper bound on the length of a tag name handled by parse_markup().
 * \note Considered sufficient for the time being (tags carry no arguments).
 */
constexpr int MAX_TAGNAME_LENGTH = 32;
//! Exception that carries the text handed to its constructor (named "filename" by the author).
class CantOpenFile : public std::exception {
public:
    //! Stores a copy of \p filename; it becomes the text reported by what().
    CantOpenFile ( const std::string &filename )
        : msg_(filename)
    {}

    //! \return the stored text; the pointer stays valid while this object is alive
    const char *what () const noexcept override
    {
        return msg_.c_str();
    }

private:
    std::string msg_;   //!< copy of the constructor argument
};
//! Exception reporting malformed input; thrown for example by read_until().
class ParsingError : public std::exception {
public:
    //! Stores a copy of \p msg; it becomes the text reported by what().
    ParsingError ( const std::string &msg )
        : msg_(msg)
    {}

    //! \return the stored text; the pointer stays valid while this object is alive
    const char *what () const noexcept override
    {
        return msg_.c_str();
    }

private:
    std::string msg_;   //!< copy of the constructor argument
};
//! Histogram. Counts inserted items. Returns items sorted by their key or counts.
template <class T>
class Histogram
{
private:
    std::map<T,int> map_;   //!< item -> number of times the item was added

public:
    // No user-declared constructor/destructor: the compiler-generated
    // special members (including moves) are exactly what is needed.

    //! Reset the histogram (all items and their counts are forgotten).
    void clear () { map_.clear(); }

    //! Add an item to the histogram.
    void add ( const T &item )
    {
        // operator[] value-initialises a missing counter to 0, so a single
        // lookup handles both a new item and an item seen before.
        ++map_[item];
    }

    //! Add every element of the vector to the histogram.
    void add ( const std::vector<T> &v )
    {
        for ( const T &item : v )
            add(item);
    }

    //! \return copy of the item -> count map (ordered by key)
    std::map<T,int> map () const { return map_; }

    //! \return (count, item) pairs ordered by count, ascending
    std::multimap<int,T> sorted () const
    {
        std::multimap<int,T> mm;
        for ( const auto &entry : map_ )
            mm.insert( {entry.second, entry.first} );
        return mm;
    }
};
/*!
 * Finds the first decimal digit (0-9) in the input string.
 * \param s input string (NUL-terminated)
 * \return position of the first digit, or the length of the string if it
 *         contains no digit
 */
inline size_t find_first_digit ( const char *s )
{
    size_t pos = 0;
    // The bounds are inclusive: '0' and '9' are digits as well. Looking for
    // the terminator directly avoids calling strlen() on every iteration.
    while ( s[pos] != '\0' && !( s[pos] >= '0' && s[pos] <= '9' ) )
        pos++;
    return pos;
}
/*!
 * Groups similar strings of the input vector. Strings are considered similar
 * when they differ only in the number they end with.
 * <b>Example: </b><br>
 * { "a 1", "d", "b 3", "ddd", "a 4", "b 2" }<br>-><br>
 * { "a 1", "a 4" }, { "b 2", "b 3" }, { "d" }, { "ddd" }
 * \param v input vector of strings (taken by non-const reference; whether it
 *          is modified cannot be told from this declaration)
 * \return alphabetically sorted vector of grouped strings
 */
vector<vector<string>> separate_groups ( vector<string> &v );
/*!
 * Checks a ZLIB error code. In case of an error throws an exception with
 * a description of the error.
 * \exception std::runtime_error
 * \param err ZLIB error code
 */
void check_zlib_error ( int err );
/*!
 * Decompresses a gzip file; copies the file if it is not gzipped.
 * Throws an exception if a file cannot be opened. Calls check_zlib_error().
 * \exception std::runtime_error
 * \param infilename input file name
 * \param outfilename output file name
 */
void gzip_decompress_file( const char *infilename, const char *outfilename );
/*!
 * Parses a string containing an integer range and returns its minimum and
 * maximum. The two bounds are separated by '-'.
 * <b>Examples</b>
 * <pre>
 *   5+    5 or more           -> { 5, 99 }
 *   3-    3 or less           -> { 0, 3 }
 *   4-8   between 4 and 8     -> { 4, 8 }
 *   7     exactly 7           -> { 7, 7 }
 * </pre>
 * A bound that is left out (empty input, "-8", "-", "+") falls back to its
 * default: 0 for the minimum, 99 for the maximum. The parsed numbers are
 * <b>not</b> clamped to that interval.
 * \exception std::invalid_argument, std::out_of_range thrown by std::stoi()
 *            when a non-empty bound is not a valid int
 * \param s string containing the integer range
 * \return { minimal_value, maximal_value }
 */
inline std::pair<int,int> parse_range ( const std::string &s )
{
    std::string min = "0";
    std::string max = "99";
    if ( !s.empty() ){
        const size_t pos = s.find('-');
        if ( s.back() == '+' )
            min = s.substr( 0, s.size()-1 );
        else if ( s.back() == '-' )
            max = s.substr( 0, s.size()-1 );
        else if ( pos != std::string::npos ){
            min = s.substr( 0, pos );
            max = s.substr( pos+1 );
        }
        else {
            min = s;
            max = s;
        }
    }
    // An omitted bound must not reach std::stoi(""), which would throw.
    if ( min.empty() ) min = "0";
    if ( max.empty() ) max = "99";
    return { std::stoi(min), std::stoi(max) };
}
/*!
 * Tries to guess a file name from an url. The file name is the part of the
 * url that follows the last '/'.
 * \param url
 * \return guessed file name, or "downloaded.file" when the guess failed
 *         (the url contains no '/' or nothing follows the last '/')
 */
inline std::string guess_fname_from_url ( const std::string &url )
{
    const size_t slash = url.find_last_of('/');
    // Compare with npos BEFORE adding 1: npos+1 wraps around to 0, which
    // would make the fallback below unreachable.
    if ( slash == std::string::npos || slash + 1 >= url.size() )
        return "downloaded.file";
    return url.substr( slash + 1 );
}
/*!
 * Convenience function telling whether the vector contains an element.
 * Equivalent to:
 * <pre>return std::find( v.begin(), v.end(), elt ) != v.end();</pre>
 * \param v vector to be searched
 * \param elt element looked for (compared with operator==)
 * \return true if at least one element of v equals elt
 */
template <class T>
inline bool is_in ( const std::vector<T> &v, const T &elt )
{
    for ( const T &candidate : v )
        if ( candidate == elt )
            return true;
    return false;
}
/*!
 * Checks whether the string consists solely of decimal digits, i.e. whether
 * it is written like a non-negative integer (no sign, no whitespace).
 * \note An empty string contains no offending character and therefore
 *       yields true; callers converting the string afterwards have to
 *       handle that case themselves.
 * \param s NUL-terminated input string
 * \return true if s contains only 0-9, false otherwise
 */
inline bool isint ( const char *s )
{
    // Walk up to the terminator instead of calling strlen() on every iteration.
    for ( const char *p = s; *p != '\0'; ++p )
        if ( *p < '0' || *p > '9' )
            return false;
    return true;
}
231 * Reads the string s until character end is found (or \0 until is reached).
232 * Throws an exception if the string is invalid.
233 * \exception ParsingError
234 * \param s [in] input string
235 * \param pos [in,out] starting position in the s, pos is modified in the process
236 * (and contains position of the <b>end</b>)
237 * \param end the input string is read until this character is reached
238 * \return substring starting at input pos and ending before character <b>end</b>
240 inline string read_until ( const char *s, size_t *pos, const char end )
242 char buff[strlen(s)]; // should be enough
243 size_t j = 0;
244 size_t i = *pos;
245 while ( s[i] != end && s[i] != '\0' )
246 buff[j++] = s[i++];
247 if ( s[i] != end )
248 throw ParsingError("read_until(): Invalid string: " + string(s));
249 *pos = ++i;
250 buff[j] = '\0';
251 return buff;
/*!
 * One tagged span of text.
 * \sa parse_markup()
 */
struct TextTag
{
    std::string tag;    //!< name of the tag
    int pos;            //!< position in the raw string (returned by parse_markup())
    int len;            //!< length of the tagged text
    int id;             // NOTE(review): meaning is not documented in this header

    //! Orders tags by their position, so that they can be sorted with std::sort.
    bool operator< ( const TextTag &other ) const
    {
        return pos < other.pos;
    }
};
/*!
 * Parses a string with simple markup (tags without attributes and without
 * overlapping).
 * Example:
 * \verbatim This is an <tag1>example</tag1> string with <tag2>markup</tag2>. \endverbatim
 * \sa TextTag
 * \param s input string containing the markup
 * \return String without tags and vector of TextTags.
 */
std::pair<string,vector<TextTag>> parse_markup ( const char *s );
//! Checks whether the file path exists (i.e. whether it can be opened for reading).
inline bool file_exists ( const char *path )
{
    // The stream closes the file by itself when it goes out of scope.
    std::ifstream probe(path);
    return probe.is_open();
}

//! Overload of file_exists() taking the path as a string.
inline bool file_exists ( const std::string &s )
{
    return file_exists( s.c_str() );
}
/*!
 * Replaces all occurrences of the string <i>match</i> in <i>s</i> by <i>repl</i>.
 * Modifies the input string. The search continues behind the inserted text,
 * so the replacement itself is never scanned again (repl may contain match).
 * An empty <i>match</i> leaves <i>s</i> untouched.
 * \note Original author's note: does not work with non-ASCII characters
 *       (at least with the libstdc++ of that time).
 * \todo utf8
 * \param s string to be modified
 * \param match substring to look for
 * \param repl text put in place of every occurrence of match
 */
inline void replace_all ( std::string &s, const std::string &match, const std::string &repl )
{
    // find() reports an empty needle at every position, which would loop forever.
    if ( match.empty() )
        return;
    size_t pos = 0;
    while ( (pos = s.find(match, pos)) != std::string::npos ){
        s.replace( pos, match.size(), repl );
        // Skip the text just inserted; otherwise a repl containing match
        // would be found again and again.
        pos += repl.size();
    }
}
/*!
 * Strips whitespace (as classified by isspace()) from the start and the end
 * of the input string.
 * \param s NUL-terminated string to be stripped
 * \return copy of the stripped input string (empty if s is empty or
 *         consists of whitespace only)
 */
inline std::string strip ( const char *s )
{
    // isspace() is only defined for values representable as unsigned char
    // (and EOF), so bytes >= 0x80 must not be passed as a negative char.
    const auto is_space = []( char c ){
        return std::isspace( static_cast<unsigned char>(c) ) != 0;
    };
    size_t start = 0;
    size_t end = std::strlen(s);
    while ( start < end && is_space(s[start]) ) start++;
    // Check the bounds first: with an empty string s[end-1] must never be read.
    while ( end > start && is_space(s[end-1]) ) end--;
    return std::string( s + start, end - start );
}
/*!
 * Joins the elements of the input vector into one string, putting the
 * separator sep between neighbouring elements.
 * <b>Example:</b>
 * <pre>
 * vector<string> v = { "a", "b", "c" }
 * to_string(v, ":") returns "a:b:c"
 * </pre>
 * \param v input vector (elements are written with operator<<)
 * \param sep string used as separator between the elements of v in the resulting string
 * \return joined string (empty for an empty vector)
 */
template <class T=std::string>
inline std::string to_string ( const std::vector<T> &v, const char *sep=", " )
{
    std::stringstream out;
    bool first = true;
    for ( const auto &item : v ){
        // the separator goes in front of every element but the first one
        if ( !first )
            out << sep;
        out << item;
        first = false;
    }
    return out.str();
}

//! Joins the elements of the input set (in the set's iteration order) into a string. See the vector overload.
template <class T=std::string>
inline std::string to_string ( const std::set<T> &s, const char *sep=", " )
{
    const std::vector<T> ordered( s.begin(), s.end() );
    return to_string<T>( ordered, sep );
}
/*!
 * Splits the string s by delimiters. Delimiters have the same meaning as in strtok().
 * \param s string to be split
 * \param delimiters delimiter characters (a single space by default)
 * \return parts of s
 */
vector<string> split_string ( const string &s, const char *delimiters=" " );
//! Splits a string of integers into a vector of ints. See split_string().
vector<int> split_string_int ( const string &s, const char *delimiters=" " );
/*!
 * Splits a string into its characters. E.g. 'hello' -> 'h','e','l','l','o'
 * \todo check & process errors
 * \param s string to be split
 * \return vector with one string per character of s
 */
vector<string> str_to_chars ( const char *s );
365 * Splits string s to chars and returns vector of their utf8 codes.
367 inline std::vector<unsigned int> utf8_to_ints ( const char *s )
369 std::vector<unsigned int> v;
370 size_t i=0;
371 while ( i < strlen(s) ){
372 int len = fl_utf8len(s[i]);;
373 if ( len > 0 ){
374 int dlen = 0;
375 unsigned int dec = fl_utf8decode( s+i, s+i+len ,&dlen);
376 if ( len != dlen )
377 printf("UTF8 decoding error: i: %d, len: %d, dlen: %d", i, len, dlen);
378 v.push_back( dec );
379 i += len;
381 else
382 i++;
384 return v;
387 } // namespace utils
388 #endif // __UTILS_HXX