Web Wispers 1.2.2 C++ Unit Tests - ssrc/wispers/utility/WebStrings.cc

Web Wispers 1.2.2 C++ Unit Test Coverage

Current view:	top level - ssrc/wispers/utility - WebStrings.cc (source / functions)		Hit	Total	Coverage
Test:	Web Wispers 1.2.2 C++ Unit Tests	Lines:	216	218	99.1 %
Date:	2012-04-09	Functions:	24	26	92.3 %
		Branches:	180	254	70.9 %

           Branch data     Line data    Source code

       1                 :            : /*
       2                 :            :  * Copyright 2006-2009 Savarese Software Research Corporation
       3                 :            :  *
       4                 :            :  * Licensed under the Apache License, Version 2.0 (the "License");
       5                 :            :  * you may not use this file except in compliance with the License.
       6                 :            :  * You may obtain a copy of the License at
       7                 :            :  *
       8                 :            :  *     https://www.savarese.com/software/ApacheLicense-2.0
       9                 :            :  *
      10                 :            :  * Unless required by applicable law or agreed to in writing, software
      11                 :            :  * distributed under the License is distributed on an "AS IS" BASIS,
      12                 :            :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      13                 :            :  * See the License for the specific language governing permissions and
      14                 :            :  * limitations under the License.
      15                 :            :  */
      16                 :            : #include <unordered_map>
      17                 :            : #include <algorithm>
      18                 :            : #include <cstring>
      19                 :            : #include <boost/algorithm/string/case_conv.hpp>
      20                 :            : #include <boost/algorithm/string/trim.hpp>
      21                 :            : #include <boost/regex.hpp>
      22                 :            : 
      23                 :            : #include <ssrc/wispers/utility/WebStrings.h>
      24                 :            : 
      25                 :            : // Note: we perform two passes on input, calculating the size of the
      26                 :            : // destination buffer on the first pass and performing the
      27                 :            : // substitutions on the second pass.  We could do this in one pass, by
      28                 :            : // appending to a dynamic buffer or storing references to ranges of
      29                 :            : // the input that do not have to be altered (followed by a loop over
      30                 :            : // the ranges, writing to the output).  However, the alternatives
      31                 :            : // would require O(N) dynamic memory allocations (either to resize the
      32                 :            : // buffer or to record a new range), where N is the number of
      33                 :            : // substitutions required by the input.  We assume that with today's
      34                 :            : // large memory caches (and given that our input will almost always be
      35                 :            : // far less than 1 MB), it is cheaper for us to do two passes (the
      36                 :            : // first pass loads the data into the cache) and a single memory
      37                 :            : // allocation with individual character copies than one pass with N
      38                 :            : // memory allocations (possibly followed by a second loop to write the
      39                 :            : // output) and N memcpy's (even though the memcpy's will be faster than
      40                 :            : // our individual-character-copying loop).
      41                 :            : //
      42                 :            : // Also note that we could genericize the algorithms to share code
      43                 :            : // between them, using functors to specialize the behavior.  We
      44                 :            : // choose not to do so for expediency of implementation.  It
      45                 :            : // takes more time to figure out how to properly genericize the
      46                 :            : // code than it does to implement the utility functions independently.
      47                 :            : //
      48                 :            : // Regardless, performance optimizations and design refinements can
      49                 :            : // always be made in the future.  Functionality comes first.
      50                 :            : 
      51                 :            : namespace {
      52                 :            :   // TODO: optimize patterns.
      53                 :          4 :   const boost::regex html_strip_pattern("<!--.*?-->|<[^>]*>");
      54                 :          4 :   const boost::regex html_strip_amp_pattern("<!--.*?-->|<[^>]*>|&[^#&;\\d\\s<>]+;|&#\\d+;|&#[xX][a-fA-F\\d]+;");
      55                 :            : 
      56                 :            :   // begin html_title_and_body support
      57                 :          4 :   const boost::regex html_title_begin("<title[^>]*>",
      58                 :            :                                       boost::regex_constants::normal |
      59                 :            :                                       boost::regex_constants::icase);
      60                 :          4 :   const boost::regex html_title_end("</title>",
      61                 :            :                                       boost::regex_constants::normal |
      62                 :            :                                       boost::regex_constants::icase);
      63                 :          4 :   const boost::regex html_body_begin("<body[^>]*>",
      64                 :            :                                      boost::regex_constants::normal |
      65                 :            :                                      boost::regex_constants::icase);
      66                 :          4 :   const boost::regex html_body_end("</body>",
      67                 :            :                                    boost::regex_constants::normal |
      68                 :            :                                    boost::regex_constants::icase);
      69                 :            :   // end html_title_and_body support
      70                 :            : 
      71                 :            :   const char hex_char[] = {
      72                 :            :     '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
      73                 :            :     'a', 'b', 'c', 'd', 'e', 'f'
      74                 :            :   };
      75                 :            : 
      76                 :            :   // Sorted array of characters that are either reserved according to
      77                 :            :   // RFC 1738 or "unsafe". 
      78                 :            :   // This array is pre-sorted so that we can search it via a binary search.
      79                 :            :   const char url_reserved_char[] = {
      80                 :            :     ' ', '"', '#', '$', '%', '&', '\'', '+', ',', '/', ':', ';', '<', '=', '>',
      81                 :            :     '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~'
      82                 :            :   };
      83                 :            : 
      84                 :            :   // This array is pre-sorted so that we can search it via a binary search.
      85                 :            :   // For such a small number of items, linear search or
      86                 :            :   // compiler-optimized switch statement may be faster!
      87                 :            :   const char html_escape_char[] = {
      88                 :            :     '"', '&', '\'', '<', '>'
      89                 :            :   };
      90                 :            : 
      91                 :            :   const char html_quot[]  = { '&', 'q', 'u', 'o', 't', ';' };
      92                 :            :   const char html_amp[]   = { '&', 'a', 'm', 'p', ';' };
      93                 :            :   const char html_apost[] = { '&', '#', '3', '9', ';' };
      94                 :            :   const char html_lt[]    = { '&', 'l', 't', ';' };
      95                 :            :   const char html_gt[]    = { '&', 'g', 't', ';' };
      96                 :            : 
      97                 :            :   struct HTMLEscapeInfo {
      98                 :            :     const char *escape_str;
      99                 :            :     const unsigned int size;
     100                 :            :   };
     101                 :            : 
     102                 :            :   const HTMLEscapeInfo html_escape_info[] = {
     103                 :            :     { 0, 1 },
     104                 :            :     { html_quot, sizeof(html_quot) },
     105                 :            :     { html_amp, sizeof(html_amp) },
     106                 :            :     { html_apost, sizeof(html_apost) },
     107                 :            :     { html_lt, sizeof(html_lt) },
     108                 :            :     { html_gt, sizeof(html_gt) }
     109                 :            :   };
     110                 :            : 
     111                 :            :   struct HTMLEntity {
     112                 :            :     const char * const entity;
     113                 :            :     const unsigned char value;
     114                 :            :   };
     115                 :            : 
     116                 :            :   const HTMLEntity html_entity[] = {
     117                 :            :     { "agrave", 0xe0 },
     118                 :            :     { "aacute", 0xe1 },
     119                 :            :     { "acirc", 0xe2 },
     120                 :            :     { "atilde", 0xe3 },
     121                 :            :     { "auml", 0xe4 },
     122                 :            :     { "aring", 0xe5 },
     123                 :            :     { "aelig", 0xe6 },
     124                 :            :     { "amp", '&' },
     125                 :            :     { "ccedil", 0xe7 },
     126                 :            :     { "copy", 0xa9 },
     127                 :            :     { "egrave", 0xe8 },
     128                 :            :     { "eacute", 0xe9 },
     129                 :            :     { "ecirc", 0xea },
     130                 :            :     { "euml", 0xeb },
     131                 :            :     { "eth", 0xf0 },
     132                 :            :     { "igrave", 0xec },
     133                 :            :     { "iacute", 0xed },
     134                 :            :     { "icirc", 0xee },
     135                 :            :     { "iuml", 0xef },
     136                 :            :     { "gt", '>' },
     137                 :            :     { "lt", '<' },
     138                 :            :     { "ntilde", 0xf1 },
     139                 :            :     { "nbsp", ' ' },
     140                 :            :     { "ograve", 0xf2 },
     141                 :            :     { "oacute", 0xf3 },
     142                 :            :     { "ocirc", 0xf4 },
     143                 :            :     { "otilde", 0xf5 },
     144                 :            :     { "ouml", 0xf6 },
     145                 :            :     { "oslash", 0xf8 },
     146                 :            :     { "quot", '"' },
     147                 :            :     { "szlig", 0xdf },
     148                 :            :     { "thorn", 0xfe },
     149                 :            :     { "ugrave", 0xf9 },
     150                 :            :     { "uacute", 0xfa },
     151                 :            :     { "ucirc", 0xfb },
     152                 :            :     { "uuml", 0xfc },
     153                 :            :     { "yacute", 0xfd }
     154                 :            :   };
     155                 :            : 
     156                 :            :   const unsigned int html_entity_size = sizeof(html_entity)/sizeof(HTMLEntity);
     157                 :            : 
     158                 :          0 :   class HTMLEntityMap {
     159                 :            :     typedef std::unordered_map<std::string, const unsigned char> map_type;
     160                 :            :     map_type _map;
     161                 :            : 
     162                 :            :   public:
     163                 :            : 
     164         [ +  - ]:          4 :     HTMLEntityMap() {
     165         [ +  + ]:        152 :       for(unsigned int i = 0; i < html_entity_size; ++i) {
     166                 :            :         _map.insert(map_type::value_type(html_entity[i].entity,
     167   [ +  -  +  -  :        148 :                                          html_entity[i].value));
                   +  - ]
     168                 :            :       }
     169                 :          4 :     }
     170                 :            : 
     171                 :          3 :     unsigned char get(const std::string & key) const {
     172                 :          3 :       map_type::const_iterator it = _map.find(key);
     173                 :            : 
     174         [ -  + ]:          3 :       return (it == _map.end() ? 0 : it->second);
     175                 :            :     }
     176                 :            :   };
     177                 :            : 
     178   [ -  +  #  # ]:          4 :   const HTMLEntityMap html_entity_map;
     179                 :            : 
     180                 :         60 :   inline bool is_reserved_url_char(const char c) {
     181                 :            :     return
     182                 :            :       (c < 32 || c > 122 ||
     183                 :            :        std::binary_search(&url_reserved_char[0],
     184   [ +  +  +  +  :         60 :                           &url_reserved_char[sizeof(url_reserved_char)], c));
                   +  + ]
     185                 :            :   }
     186                 :            : 
     187                 :         13 :   inline unsigned int html_escape_char_size(const char c) {
     188   [ +  +  +  +  :         13 :     switch(c) {
                   +  + ]
     189                 :          1 :     case '"' : return sizeof(html_quot);
     190                 :          1 :     case '&' : return sizeof(html_amp);
     191                 :          1 :     case '\'': return sizeof(html_apost);
     192                 :          1 :     case '<' : return sizeof(html_lt);
     193                 :          1 :     case '>' : return sizeof(html_gt);
     194                 :          8 :     default  : return 1;
     195                 :            :     };
     196                 :            :   }
     197                 :            : 
     198                 :         13 :   inline unsigned int html_escape_char_index(const char c) {
     199   [ +  +  +  +  :         13 :     switch(c) {
                   +  + ]
     200                 :          1 :     case '"' : return 1;
     201                 :          1 :     case '&' : return 2;
     202                 :          1 :     case '\'': return 3;
     203                 :          1 :     case '<' : return 4;
     204                 :          1 :     case '>' : return 5;
     205                 :          8 :     default  : return 0;
     206                 :            :     };
     207                 :            :   }
     208                 :            : 
     209                 :            :   /*
     210                 :            :   inline bool is_html_escape_char(const char c) {
     211                 :            :     return (html_escape_char_size(c) > 1);
     212                 :            :   }
     213                 :            : 
     214                 :            :   inline bool is_html_escape_char(const char c) {
     215                 :            :     return std::binary_search(&html_escape_char[0],
     216                 :            :                               &html_escape_char[sizeof(html_escape_char)], c);
     217                 :            :   }
     218                 :            :   */
     219                 :            : 
     220                 :          9 :   inline char * escape_javascript_char(char *dest, const char c) {
     221                 :          9 :     *dest++ = '\\';
     222                 :          9 :     *dest++ = 'x';
     223                 :          9 :     *dest++ = hex_char[((c >> 4) & 0x0f)];
     224                 :          9 :     *dest++ = hex_char[(c & 0x0f)];
     225                 :          9 :     return dest;
     226                 :            :   }
     227                 :            : 
     228                 :         27 :   inline char * escape_url_char(char *dest, const char c) {
     229                 :         27 :     *dest++ = '%';
     230                 :         27 :     *dest++ = hex_char[((c >> 4) & 0x0f)];
     231                 :         27 :     *dest++ = hex_char[(c & 0x0f)];
     232                 :         27 :     return dest;
     233                 :            :   }
     234                 :            : 
     235                 :          1 :   inline unsigned int _escaped_javascript_size(const char *src,
     236                 :            :                                                const unsigned int src_size)
     237                 :            :   {
     238                 :          1 :     unsigned int pos = 0, escaped_size = 0;
     239                 :            : 
     240   [ +  +  +  -  :         18 :     while(*src && pos++ < src_size) {
                   +  + ]
     241         [ +  + ]:         16 :       if(*src >= 32) {
     242         [ +  + ]:         15 :         switch(*src) {
     243                 :            :         case '"':
     244                 :            :         case '&':
     245                 :            :         case '\'':
     246                 :            :         case '/':
     247                 :            :         case ';':
     248                 :            :         case '<':
     249                 :            :         case '>':
     250                 :          8 :         case '\\': escaped_size+=4; break;
     251                 :          7 :         default: ++escaped_size; break;
     252                 :            :         };
     253                 :            :       } else {
     254                 :          1 :         escaped_size+=4;
     255                 :            :       }
     256                 :            : 
     257                 :         16 :       ++src;
     258                 :            :     }
     259                 :            : 
     260                 :          1 :     return escaped_size;
     261                 :            :   }
     262                 :            : 
     263                 :          1 :   inline void _escape_javascript(char *dest, const char *src,
     264                 :            :                                  const unsigned int src_size)
     265                 :            :   {
     266                 :          1 :     unsigned int pos = 0;
     267                 :            : 
     268   [ +  +  +  -  :         18 :     while(*src && pos++ < src_size) {
                   +  + ]
     269         [ +  + ]:         16 :       if(*src >= 32) {
     270         [ +  + ]:         15 :         switch(*src) {
     271                 :            :         case '"':
     272                 :            :         case '&':
     273                 :            :         case '\'':
     274                 :            :         case '/':
     275                 :            :         case ';':
     276                 :            :         case '<':
     277                 :            :         case '>':
     278                 :          8 :         case '\\': dest = escape_javascript_char(dest, *src); break;
     279                 :          7 :         default: *dest++ = *src; break;
     280                 :            :         };
     281                 :            :       } else {
     282                 :          1 :         dest = escape_javascript_char(dest, *src);
     283                 :            :       }
     284                 :            : 
     285                 :         16 :       ++src;
     286                 :            :     }
     287                 :          1 :   }
     288                 :            : 
     289                 :            : 
     290                 :          1 :   inline unsigned int _escaped_html_size(const char *src,
     291                 :            :                                          const unsigned int src_size)
     292                 :            :   {
     293                 :          1 :     unsigned int pos = 0, escaped_size = 0;
     294                 :            : 
     295   [ +  +  +  -  :         15 :     while(*src && pos++ < src_size) {
                   +  + ]
     296                 :         13 :       escaped_size+=html_escape_char_size(*src);
     297                 :         13 :       ++src;
     298                 :            :     }
     299                 :            : 
     300                 :          1 :     return escaped_size;
     301                 :            :   }
     302                 :            : 
     303                 :          1 :   inline void _escape_html(char *dest, const char *src,
     304                 :            :                            const unsigned int src_size)
     305                 :            :   {
     306                 :          1 :     unsigned int pos = 0;
     307                 :            : 
     308   [ +  +  +  -  :         15 :     while(*src && pos++ < src_size) {
                   +  + ]
     309                 :         13 :       const unsigned int index = html_escape_char_index(*src);
     310                 :            : 
     311         [ +  + ]:         13 :       if(index == 0) {
     312                 :          8 :         *dest++ = *src;
     313                 :            :       } else {
     314                 :          5 :         const HTMLEscapeInfo & info = html_escape_info[index];
     315                 :          5 :         std::memcpy(dest, info.escape_str, info.size);
     316                 :          5 :         dest+=info.size;
     317                 :            :       }
     318                 :            : 
     319                 :         13 :       ++src;
     320                 :            :     }
     321                 :          1 :   }
     322                 :            : 
     323                 :          1 :   inline unsigned int _escaped_url_size(const char *src,
     324                 :            :                                         const unsigned int src_size)
     325                 :            :   {
     326                 :          1 :     unsigned int pos = 0, escaped_size = 0;
     327                 :            : 
     328   [ +  +  +  -  :         32 :     while(*src && pos++ < src_size) {
                   +  + ]
     329         [ +  + ]:         30 :       escaped_size += (is_reserved_url_char(*src++) ? 3 : 1);
     330                 :            :     }
     331                 :            : 
     332                 :          1 :     return escaped_size;
     333                 :            :   }
     334                 :            : 
     335                 :          1 :   inline void _escape_url(char *dest, const char *src,
     336                 :            :                           const unsigned int src_size)
     337                 :            :   {
     338                 :          1 :     unsigned int pos = 0;
     339                 :            : 
     340   [ +  +  +  -  :         32 :     while(*src && pos++ < src_size) {
                   +  + ]
     341         [ +  + ]:         30 :       if(is_reserved_url_char(*src)) {
     342                 :         27 :         dest = escape_url_char(dest, *src);
     343                 :            :       } else {
     344                 :          3 :         *dest++ = *src;
     345                 :            :       }
     346                 :         30 :       ++src;
     347                 :            :     }
     348                 :          1 :   }
     349                 :            : 
     350                 :          5 :   inline char * unescape_char_entity(char *dest, const char *src,
     351                 :            :                                      const unsigned int src_size)
     352                 :            :   {
     353                 :          5 :     const char * const begin = src + 1;
     354                 :          5 :     long value = 0;
     355                 :            : 
     356                 :            :     // We can't place a temporary null guard in src because it may
     357                 :            :     // be a read-only memory-mapped region.  We trust strtol to end
     358                 :            :     // when it encounters the terminating colon.
     359         [ +  + ]:          5 :     if(*begin == '#') {
     360         [ +  + ]:          2 :       if(std::tolower(*(begin + 1)) == 'x') {
     361                 :          1 :         value = strtol(begin + 2, 0, 16);
     362                 :            :       } else {
     363                 :          1 :         value = strtol(begin + 1, 0, 10);
     364                 :            :       }
     365                 :            :     } else {
     366         [ +  - ]:          6 :       std::string key(begin, src_size - 2);
     367         [ +  - ]:          3 :       boost::to_lower(key);
     368         [ +  - ]:          3 :       value = html_entity_map.get(key);
     369                 :            :     }
     370                 :            : 
     371   [ +  -  +  - ]:          5 :     if(value > 0 && value < 256) {
     372                 :          5 :       *dest = value;
     373                 :          5 :       return (dest + 1);
     374                 :            :     }
     375                 :            : 
     376                 :          0 :     return dest;
     377                 :            :   }
     378                 :            : }
     379                 :            : 
     380                 :            : __BEGIN_NS_SSRC_WSPR_UTILITY
     381                 :            : 
     382                 :            : /*
     383                 :            :  * Escapes characters in text that could result in executing
     384                 :            :  * JavaScript code when passed to a JavaScript interpreter.
     385                 :            :  * In addition, all leading and trailing whitespace is removed.
     386                 :            :  *
     387                 :            :  * @param result Buffer in which to store escaped text.  The
     388                 :            :  *        string will be resized to hold the text.
     389                 :            :  * @param text The text to be escaped.
     390                 :            :  * @param text_size The length of the unescaped text.
     391                 :            :  */
     392                 :          1 : void escape_javascript(string & result, const char *text,
     393                 :            :                        const unsigned int text_size)
     394                 :            : {
     395                 :          1 :   result.resize(_escaped_javascript_size(text, text_size));
     396                 :          1 :   _escape_javascript(&result[0], text, text_size);
     397         [ +  - ]:          1 :   boost::algorithm::trim(result);
     398                 :          1 : }
     399                 :            : 
     400                 :            : /**
     401                 :            :  * Escapes characters in text that could result in rendering HTML when
     402                 :            :  * passed to an HTML renderer.  In addition, all leading and trailing
     403                 :            :  * whitespace is removed.
     404                 :            :  *
     405                 :            :  * @param result Buffer in which to store escaped text.  The
     406                 :            :  *        string will be resized to hold the text.
     407                 :            :  * @param text The text to be escaped.
     408                 :            :  * @param text_size The length of the unescaped text.
     409                 :            :  */
     410                 :          1 : void escape_html(string & result, const char *text,
     411                 :            :                  const unsigned int text_size)
     412                 :            : {
     413                 :          1 :   result.resize(_escaped_html_size(text, text_size));
     414                 :          1 :   _escape_html(&result[0], text, text_size);
     415         [ +  - ]:          1 :   boost::algorithm::trim(result);
     416                 :          1 : }
     417                 :            : 
     418                 :            : /**
     419                 :            :  * Escapes characters in text that are either reserved or "unsafe"
     420                 :            :  * according to RFC 1738.  Trailing spaces are escaped and not trimmed.
     421                 :            :  *
     422                 :            :  * @param result Buffer in which to store escaped text.  The
     423                 :            :  *        string will be resized to hold the text.
     424                 :            :  * @param text The text to be escaped.
     425                 :            :  * @param text_size The length of the unescaped text.
     426                 :            :  */
     427                 :          1 : void escape_url(string & result, const char *text,
     428                 :            :                 const unsigned int text_size)
     429                 :            : {
     430                 :          1 :   result.resize(_escaped_url_size(text, text_size));
     431                 :          1 :   _escape_url(&result[0], text, text_size);
     432                 :          1 : }
     433                 :            : 
     /**
      * Unescapes URL escape sequences in place.
      *
      * Every '+' is replaced by a space and every "%XX" sequence, where XX
      * are two hexadecimal digits (either case), is replaced by the single
      * byte it encodes.  A '%' that is not followed by two hexadecimal
      * digits is copied through unchanged.  The string is shortened by two
      * characters for each decoded escape sequence.
      *
      * @param url The URL to unescape.
      */
     void unescape_url(string & url) {
       // Value of a single hexadecimal digit.  Clearing bit 0x20 folds
       // lower-case letters onto upper-case ones.  Callers must have
       // verified the digit with std::isxdigit first.
       const auto hex_value = [](const unsigned char c) -> unsigned int {
         return (c >= 'A') ? ((c & 0xdfu) - 'A' + 10u) : (c - '0');
       };

       const string::iterator end = url.end();
       string::iterator out = url.begin();   // next position to write

       for(string::iterator in = url.begin(); in != end; ++in, ++out) {
         if(*in == '+') {
           *out = ' ';
         } else if(*in == '%' && (end - in) > 2
                   // std::isxdigit has undefined behavior for negative
                   // arguments, so bytes >= 0x80 must go through unsigned char.
                   && std::isxdigit(static_cast<unsigned char>(in[1]))
                   && std::isxdigit(static_cast<unsigned char>(in[2])))
         {
           *out = static_cast<char>(hex_value(in[1]) * 16u + hex_value(in[2]));
           in+=2;   // skip the two digits; the loop increment skips the '%'
         } else if(out != in) {
           // Only copy once at least one escape has been collapsed.
           *out = *in;
         }
       }

       // Drop the leftover tail if the decoded text is shorter than the input.
       if(out != end) {
         url.resize(out - url.begin());
       }
     }
     466                 :            : 
     467                 :            : /**
     468                 :            :  * Removes all HTML tags from text.
     469                 :            :  *
     470                 :            :  * @param result Buffer in which to store stripped text.  The
     471                 :            :  *        string will be resized to hold the text.
     472                 :            :  * @param text The text to be stripped.
     473                 :            :  * @param text_size The length of the unstripped text.
     474                 :            :  */
     475                 :          2 : void strip_html(string & result, const char *text,
     476                 :            :                 const unsigned int text_size)
     477                 :            : 
     478                 :            : {
     479                 :          4 :   const boost::cregex_iterator end;
     480   [ +  -  +  - ]:          4 :   boost::cregex_iterator it(text, text + text_size, html_strip_pattern);
     481                 :          2 :   const char *src = text;
     482                 :            : 
     483         [ +  - ]:          2 :   result.resize(text_size, ' ');
     484                 :            : 
     485         [ +  - ]:          2 :   char *dest = &result[0];
     486                 :            : 
     487   [ +  -  +  + ]:         15 :   while(it != end) {
     488         [ +  - ]:         11 :     const boost::cregex_iterator::reference m = *it;
     489         [ +  - ]:         11 :     const boost::cregex_iterator::difference_type size = m[0].first - src;
     490                 :            : 
     491         [ +  + ]:         11 :     if(size > 0) {
     492                 :          5 :       std::memcpy(dest, src, size);
     493                 :          5 :       dest+=size;
     494                 :            :     }
     495                 :            : 
     496         [ +  - ]:         11 :     src = m[0].second;
     497         [ +  - ]:         11 :     ++it;
     498                 :            :   }
     499                 :            : 
     500                 :          2 :   const int tail_size = text_size - (src - text);
     501         [ +  + ]:          2 :   if(tail_size > 0) {
     502                 :          1 :     std::memcpy(dest, src, tail_size);
     503                 :            :   }
     504                 :            : 
     505         [ +  - ]:          2 :   boost::algorithm::trim(result);
     506                 :          2 : }
     507                 :            : 
     508                 :            : /**
     509                 :            :  * Removes all HTML tags from text and unescapes character entities,
     510                 :            :  * converting them into the characters they represent.
     511                 :            :  *
     512                 :            :  * @param result Buffer in which to store stripped text.  The
     513                 :            :  *        string will be resized to hold the text.
     514                 :            :  * @param text The text to be stripped.
     515                 :            :  * @param text_size The length of the unstripped text.
     516                 :            :  */
     517                 :          1 : void strip_html_and_unescape(string & result, const char *text,
     518                 :            :                              const unsigned int text_size)
     519                 :            : {
     520                 :          2 :   const boost::cregex_iterator end;
     521   [ +  -  +  - ]:          2 :   boost::cregex_iterator it(text, text + text_size, html_strip_amp_pattern);
     522                 :          1 :   const char *src = text;
     523                 :            : 
     524         [ +  - ]:          1 :   result.resize(text_size, ' ');
     525                 :            : 
     526         [ +  - ]:          1 :   char *dest = &result[0];
     527                 :            : 
     528   [ +  -  +  + ]:         18 :   while(it != end) {
     529         [ +  - ]:         16 :     const boost::cregex_iterator::reference m = *it;
     530         [ +  - ]:         16 :     const char *first = m[0].first;
     531                 :         16 :     const boost::cregex_iterator::difference_type size = first - src;
     532                 :            : 
     533         [ +  + ]:         16 :     if(size > 0) {
     534                 :          7 :       std::memcpy(dest, src, size);
     535                 :          7 :       dest+=size;
     536                 :            :     }
     537                 :            : 
     538         [ +  + ]:         16 :     if(*first == '&') {
     539   [ +  -  +  -  :          5 :       dest = unescape_char_entity(dest, first, m[0].length());
                   +  - ]
     540                 :            :     }
     541                 :            : 
     542         [ +  - ]:         16 :     src = m[0].second;
     543         [ +  - ]:         16 :     ++it;
     544                 :            :   }
     545                 :            : 
     546                 :          1 :   const int tail_size = text_size - (src - text);
     547         [ +  - ]:          1 :   if(tail_size > 0) {
     548                 :          1 :     std::memcpy(dest, src, tail_size);
     549                 :            :   }
     550                 :            : 
     551         [ +  - ]:          1 :   boost::algorithm::trim(result);
     552                 :          1 : }
     553                 :            : 
     /**
      * Replaces spaces in text with newlines in such a manner that no line
      * exceeds a specified maximum length, except for sequences of characters
      * uninterrupted by spaces that exceed the maximum line length.
      *
      * Existing newlines are preserved and start a new line.  Only the
      * first text_size characters are examined or modified; the buffer
      * does not need to be null-terminated.
      *
      * @param text The text to be line-wrapped.
      * @param text_size The length of the text.
      * @param max_length The maximum line length.
      */
     void wrap_lines(char *text, const unsigned int text_size,
                     const unsigned int max_length)
     {
       char * const end = text + text_size;

       // text always points at the start of the line currently examined.
       while(text < end) {
         // Bounded search: unlike strchr, memchr never looks past end, so
         // neither a missing terminator nor a newline beyond text_size can
         // drag the wrapping loop outside the caller's range.
         char *line_end =
           static_cast<char *>(std::memchr(text, '\n', end - text));

         if(line_end == 0) {
           line_end = end;
         }

         if(static_cast<std::size_t>(line_end - text) > max_length) {
           char *last_space = 0;   // most recent unused break candidate
           unsigned int length = 0;

           // [text, line_end) holds no newline, so only spaces need checking.
           for(char *p = text; p < line_end; ++p) {
             if(*p == ' ') {
               last_space = p;
             }
             if(++length > max_length && last_space != 0) {
               *last_space = '\n';
               // The characters after the break already belong to the new line.
               length = (p - last_space);
               last_space = 0;
             }
           }
         }

         if(line_end == end) {
           break;   // consumed everything; avoid forming a pointer past end
         }

         text = line_end + 1;
       }
     }
     599                 :            : 
     600                 :            : /**
     601                 :            :  * Returns the offsets of the title and body of an HTML document.
     602                 :            :  */
     603                 :          3 : title_body_type html_title_and_body(const char *begin, const char *end) {
     604                 :            :   title_body_type result(static_cast<const char *>(0), 0,
     605                 :          3 :                          static_cast<const char *>(0), 0);
     606         [ +  - ]:          6 :   boost::cmatch match;
     607                 :            : 
     608   [ +  -  +  + ]:          3 :   if(boost::regex_search(begin, end, match, html_title_begin)) {
     609   [ +  -  +  - ]:          2 :     std::get<0>(result) = match[0].second;
     610   [ +  -  +  -  :          2 :     if(boost::regex_search(match[0].second, end, match, html_title_end)) {
                   +  - ]
     611   [ +  -  +  -  :          2 :       std::get<1>(result) = match[0].first - std::get<0>(result);;
                   +  - ]
     612                 :            :     }
     613                 :            :   }
     614                 :            : 
     615   [ +  -  +  -  :          6 :   if(boost::regex_search((std::get<1>(result) == 0 ? begin :
                   +  + ]
     616   [ +  -  +  - ]:          2 :                           std::get<0>(result) + std::get<1>(result)),
     617         [ +  + ]:          5 :                          end, match, html_body_begin))
     618                 :            :   {
     619   [ +  -  +  - ]:          2 :     std::get<2>(result) = match[0].second;
     620   [ +  -  +  -  :          2 :     if(boost::regex_search(std::get<2>(result), end, match, html_body_end)) {
                   +  - ]
     621   [ +  -  +  -  :          2 :       std::get<3>(result) = match[0].first - std::get<2>(result);
                   +  - ]
     622                 :            :     }
     623                 :            :   }
     624                 :            : 
     625                 :          3 :   return result;
     626                 :            : }
     627                 :            : 
     628   [ +  -  +  - ]:         12 : __END_NS_SSRC_WSPR_UTILITY