Main Page   Namespace List   Compound List   File List   Compound Members   File Members  

pcre++.cc

Go to the documentation of this file.
00001 /*
00002  *
00003  *  This file  is part of the PCRE++ Class Library.
00004  *
00005  *  By  accessing  this software,  PCRE++, you  are  duly informed
00006  *  of and agree to be  bound  by the  conditions  described below
00007  *  in this notice:
00008  *
00009  *  This software product,  PCRE++,  is developed by Thomas Linden
00010  *  and  copyrighted (C) 2002  by  Thomas Linden,  with all rights 
00011  *  reserved.
00012  *
00013  *  There  is no charge for PCRE++ software.  You can redistribute
00014  *  it and/or modify it under the terms of the GNU  Lesser General
00015  *  Public License, which is incorporated by reference herein.
00016  *
00017  *  PCRE++ is distributed WITHOUT ANY WARRANTY, IMPLIED OR EXPRESS,
00018  *  OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE or that
00019  *  the use of it will not infringe on any third party's intellec-
00020  *  tual property rights.
00021  *
00022  *  You should have received a copy of the GNU Lesser General Public
00023  *  License along with PCRE++.  Copies can also be obtained from:
00024  *
00025  *    http://www.gnu.org/licenses/lgpl.txt
00026  *
00027  *  or by writing to:
00028  *
00029  *  Free Software Foundation, Inc.
00030  *  59 Temple Place, Suite 330
00031  *  Boston, MA 02111-1307
00032  *  USA
00033  *
00034  *  Or contact:
00035  *
00036  *   "Thomas Linden" <tom@daemon.de>
00037  *
00038  *
00039  */
00040 
00041 
00042 #include "pcre++.h"
00043 
00044 
00045 /*
00046  * CONSTRUCTORS
00047  */
00048 Pcre::Pcre(const string& expression) {
00049   _expression   = expression;
00050   _flags        = 0;
00051   case_t = global_t = false;
00052   zero();
00053   Compile(0);
00054 }
00055 
00056 Pcre::Pcre(const string& expression, const string& flags) {
00057   _expression   = expression;
00058   unsigned int FLAG = 0;
00059 
00060   for(unsigned int flag=0; flag<flags.length(); flag++) {
00061     switch(flags[flag]) {
00062     case 'i': FLAG |= PCRE_CASELESS;  case_t = true;   break;
00063     case 'm': FLAG |= PCRE_MULTILINE;                  break;
00064     case 's': FLAG |= PCRE_DOTALL;                     break;
00065     case 'x': FLAG |= PCRE_EXTENDED;                   break;
00066     case 'g':                         global_t = true; break;
00067     }
00068   }
00069 
00070   _flags = FLAG;
00071 
00072   zero();
00073   Compile(FLAG);
00074 }
00075 
00076 Pcre::Pcre(const Pcre &P) {
00077   _expression = P._expression;
00078   _flags      = P._flags;
00079   case_t      = P.case_t;
00080   global_t    = P.global_t;
00081   zero();
00082   Compile(_flags);
00083 }
00084 
00085 Pcre::Pcre() {
00086   zero();
00087 }
00088 
00089 
00090 
00091 
00092 
00093 
00094 
00095 /*
00096  * Destructor
00097  */
00098 Pcre::~Pcre() {
00099   /* avoid deleting of uninitialized pointers */
00100   if (p_pcre != NULL) {
00101     pcre_free(p_pcre);
00102   }
00103   if (p_pcre_extra != NULL) {
00104     pcre_free(p_pcre_extra);
00105   }
00106   if(sub_vec != NULL) {
00107     delete[] sub_vec;
00108   }
00109   if(num_matches > 0) {
00110     delete resultset;
00111   }
00112 }
00113 
00114 
00115 
00116 
00117 /*
00118  * operator= definitions
00119  */
00120 const Pcre& Pcre::operator = (const string& expression) {
00121   /* reset the object and re-intialize it */
00122   reset();
00123   _expression = expression;
00124   _flags      = 0;
00125   case_t = global_t = false;
00126   Compile(0);
00127   return *this;
00128 }
00129 
00130 
00131 const Pcre& Pcre::operator = (const Pcre &P) {
00132   reset();
00133   _expression = P._expression;
00134   _flags      = P._flags;
00135   case_t      = P.case_t;
00136   global_t    = P.global_t;
00137   zero();
00138   Compile(_flags);
00139   return *this;
00140 }
00141 
00142 
00143 
00144 
00145 
00146 
00147 /*
00148  * mem resetting methods
00149  */
00150 void Pcre::zero() {
00151   /* what happens if p_pcre is already allocated? hm ... */
00152   p_pcre_extra = NULL;
00153   p_pcre       = NULL;
00154   sub_vec      = NULL;
00155   resultset    = NULL;
00156   err_str      = NULL;
00157   num_matches  = -1;
00158 }
00159 
00160 void Pcre::reset() {
00161   did_match   = false;
00162   num_matches = -1;
00163 }
00164 
00165 
00166 
00167 
00168 
00169 /*
00170  * compile the expression
00171  */
00172 void Pcre::Compile(int flags) {
00173   p_pcre       = pcre_compile((char *)_expression.c_str(), flags,
00174                               (const char **)(&err_str), &erroffset, NULL);
00175 
00176   if(p_pcre == NULL) {
00177     /* umh, that's odd, the parser should not fail at all */
00178     string Error = err_str;
00179     throw exception("pcre_compile(..) failed: " + Error);
00180   }
00181 
00182   /* calculate the number of substrings we are willing to catch */
00183   int where;
00184   int info = pcre_fullinfo( p_pcre, p_pcre_extra, PCRE_INFO_CAPTURECOUNT, &where);
00185   if(info == 0) {
00186     sub_len = (where +2) * 3; /* see "man pcre" for the exact formula */
00187   }
00188   else {
00189     throw exception(info);
00190   }
00191   reset();
00192 }
00193 
00194 
00195 
00196 
00197 /*
00198  * API methods
00199  */
00200 bool Pcre::search(const string& stuff, int OffSet) {
00201   return dosearch(stuff, OffSet);
00202 }
00203 
00204 bool Pcre::search(const string& stuff) {
00205   return dosearch(stuff, 0);
00206 }
00207 
00208 bool Pcre::dosearch(const string& stuff, int OffSet) {
00209   reset();
00210   if (sub_vec != NULL)
00211     delete sub_vec;
00212 
00213   sub_vec = new int[sub_len];
00214   int num = pcre_exec(p_pcre, p_pcre_extra, (char *)stuff.c_str(),
00215                         (int)stuff.length(), OffSet, 0, (int *)sub_vec, sub_len);
00216 
00217   if(num < 0) {
00218     /* no match at all */
00219     return false;
00220   }
00221   else if(num == 0) {
00222     /* vector too small, there were too many substrings in stuff */
00223     return false;
00224   }
00225   else if(num == 1) {
00226     /* we had a match, but without substrings */
00227     did_match = true;
00228     num_matches = 0;
00229     return true;
00230   }
00231   else if(num > 1) {
00232     /* we had matching substrings */
00233     if (resultset != NULL)
00234       delete resultset;
00235     resultset = new Array;
00236     const char **stringlist;
00237     did_match = true;
00238     num_matches = num - 1;
00239 
00240     int res = pcre_get_substring_list((char *)stuff.c_str(), sub_vec, num, &stringlist);
00241     if(res == 0) {
00242       for(int i=1; i<num; i++) {
00243         resultset->push_back(stringlist[i]);
00244       }
00245       pcre_free_substring_list(stringlist);
00246     }
00247     else {
00248       throw exception(res);
00249     }
00250     return true;
00251   }
00252   else {
00253     /* some other uncommon error occured */
00254     return false;
00255   }
00256 }
00257 
00258 Array* Pcre::get_sub_strings() {
00259   if(resultset != NULL)
00260     return resultset;
00261   else
00262     return NULL;
00263 }
00264 
00265 string Pcre::get_match(int pos) {
00266   if(pos >= 0 && pos < num_matches) {
00267     ArrayIterator P = resultset->begin() + pos;
00268     return *P;
00269   }
00270   else {
00271     throw exception("out of range");
00272   }
00273 }
00274 
00275 int Pcre::get_match_start() {
00276   if (sub_vec)
00277     return sub_vec[0];
00278   else
00279     return -1;
00280 }
00281 
00282 int Pcre::get_match_end() {
00283   if (sub_vec)
00284     return sub_vec[1] - 1;
00285   else
00286     return -1;
00287 }
00288 
00289 int Pcre::get_match_start(int pos) {
00290   if(pos >= 0 && pos <= num_matches) {
00291     /*
00292      * sub_vec[0] and [1] is the start/end of the entire string.
00293      */
00294     return sub_vec[ (++pos) * 2 ];
00295   }
00296   else {
00297     throw exception("out of range");
00298   }  
00299 }
00300 
00301 int Pcre::get_match_end(int pos) {
00302   if(pos >= 0 && pos <= num_matches) {
00303     /*
00304      * the end offset of a subpattern points to
00305      * the first offset of the next substring,
00306      * therefore -1
00307      */
00308     return sub_vec[ ((++pos) * 2) + 1 ] - 1;
00309   }
00310   else {
00311     throw exception("out of range");
00312   }
00313 }
00314 
00315 size_t Pcre::get_match_length(int pos) {
00316   if(pos >= 0 && pos < num_matches) {
00317     ArrayIterator P = resultset->begin() + pos;
00318     return P->length();
00319   }
00320   else {
00321     throw exception("out of range");
00322   }
00323 }
00324 
00325 Array Pcre::_split(const string& piece, int limit, int start_offset, int end_offset) {
00326   Array Splitted;
00327   /* _expression will be used as delimiter */
00328   if(_expression.length() == 1) {
00329     /* use the plain c++ way, ignore the pre-compiled p_pcre */
00330     string buffer, _delimiter, _piece;
00331     char z;
00332     if(case_t) {
00333       z = toupper(_expression[0]);
00334       for(size_t pos=0; pos < piece.length(); pos++) {
00335         _piece += (char)toupper(piece[pos]);
00336       }
00337     }
00338     else {
00339       z = _expression[0];
00340       _piece = piece;
00341     }
00342     for(size_t pos=0; pos<piece.length(); pos++) {
00343       if(_piece[pos] == z) {
00344         Splitted.push_back(buffer);
00345         buffer = "";
00346       }
00347       else {
00348         buffer += piece[pos];
00349       }
00350     }
00351     if(buffer != "") {
00352       Splitted.push_back(buffer);
00353     }
00354   }
00355   else {
00356     /* use the regex way */
00357     if(_expression[0] != '(' && _expression[ _expression.length() - 1 ] != ')' ) {
00358       /* oh, oh - the pre-compiled expression does not contain brackets */
00359       pcre_free(p_pcre);
00360       pcre_free(p_pcre_extra);
00361       
00362       pcre       *_p = NULL;
00363       pcre_extra *_e = NULL;;
00364 
00365       p_pcre = _p;
00366       p_pcre_extra = _e;
00367 
00368       _expression = "(" + _expression + ")";
00369       Compile(_flags);
00370     }
00371     int num_pieces=0, pos=0, piece_end = 0, piece_start = 0;
00372     for(;;) {
00373       if(search(piece, pos) == true) {
00374         if(matches() > 0) {
00375           piece_end   = get_match_start(0) - 1;
00376           piece_start = pos;
00377           pos = piece_end + 1 + get_match_length(0);
00378           string junk(piece, piece_start, (piece_end - piece_start)+1);
00379           num_pieces++;
00380           if( (limit != 0 && num_pieces < limit) || limit == 0) {
00381             if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00382               if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00383                 /* we are within the allowed range, so just add the grab */
00384                 Splitted.push_back(junk);
00385               }
00386             }
00387           }
00388         }
00389       }
00390       else {
00391         /* the rest of the string, there are no more delimiters */
00392         string junk(piece, pos, (piece.length() - pos));
00393         num_pieces++;
00394         if( (limit != 0 && num_pieces < limit) || limit == 0) {
00395           if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00396             if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00397               /* we are within the allowed range, so just add the grab */
00398               Splitted.push_back(junk);
00399             }
00400           }
00401         }
00402         break;
00403       }
00404     } // for()
00405   } // if(_expression.length()
00406   return Splitted;
00407 }
00408 
00409 Array Pcre::split(const string& piece) {
00410   return _split(piece, 0, 0, 0);
00411 }
00412 
00413 Array Pcre::split(const string& piece, int limit) {
00414   return _split(piece, limit, 0, 0);
00415 }
00416 
00417 Array Pcre::split(const string& piece, int limit, int start_offset) {
00418   return _split(piece, limit, start_offset, 0);
00419 }
00420 
00421 Array Pcre::split(const string& piece, int limit, int start_offset, int end_offset) {
00422   return _split(piece, limit, start_offset, end_offset);
00423 }
00424 
00425 Array Pcre::split(const string& piece, vector<int> positions) {
00426   Array PreSplitted = _split(piece, 0, 0, 0);
00427   Array Splitted;
00428   for(vector<int>::iterator vecIt=positions.begin(); vecIt != positions.end(); ++vecIt) {
00429     Splitted.push_back(PreSplitted[*vecIt]);
00430   }
00431   return Splitted;
00432 }
00433 
00434 
00435 
00436 string Pcre::replace(const string& piece, const string& with) {
00437   string Replaced(piece);
00438 
00439   /*
00440    * very first job: look, if the expression already contains
00441    * braces, if yes, do not add braces, else, do it
00442    */
00443   Pcre braces("[^\\\\]\\(.*[^\\\\]\\)"); // perlish: [^\\]\(.*[^\\]\)
00444   if(! braces.search(_expression)) {
00445     //  if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) {
00446     /* oh, oh - the pre-compiled expression does not contain brackets */
00447 
00448     /* recreate the p_pcre* objects to avoid memory leaks */
00449     pcre_free(p_pcre);
00450     pcre_free(p_pcre_extra);
00451       
00452     pcre       *_p = NULL;
00453     pcre_extra *_e = NULL;;
00454 
00455     p_pcre = _p;
00456     p_pcre_extra = _e;
00457 
00458     _expression = "(" + _expression + ")";
00459     Compile(_flags);
00460   }
00461 
00462   if(search(piece)) {
00463     /* we found at least one match */
00464     string use_with = _replace_vars(with);
00465     if(!global_t) {
00466       /*
00467        * only once, use the entire match
00468        * Patch submitted by Mark Carrington <mark@mutantpenguin.co.uk>
00469        */
00470       if(matched() && matches() >= 1) {
00471         int len = get_match_end() - get_match_start() + 1;
00472         Replaced.replace(get_match_start(0), len, use_with);
00473       }
00474     }
00475     else {
00476       /*
00477        * global replace.
00478        *
00479        * We need to keep checking the line after it is modified to see the next match.
00480        * Especially \s is something of a bitch as it can be a newline, return carriage,
00481        * space, tab, etc ... so we have to keep  searching for the next type.
00482        * Patch submitted by Jim Hull <imaginos@imaginos.net>
00483        */
00484       string sLeftOver = Replaced;
00485       int iCurPosition = 0;
00486       while( search( sLeftOver ) ) {
00487         if( matched() && matches() >= 1 ) {
00488           int len = 0;
00489           string lookfor;
00490           lookfor.erase();
00491           int match_pos;
00492           for (match_pos = 0; match_pos < matches(); match_pos++) {
00493             len += ((get_match_end(match_pos) - get_match_start(match_pos)) + 1);
00494             lookfor += get_match(match_pos);
00495           }
00496           match_pos = Replaced.find( lookfor, iCurPosition );
00497           Replaced.replace(match_pos, len, use_with);
00498           iCurPosition = ( match_pos + use_with.length() );
00499           sLeftOver = Replaced.substr( iCurPosition, string::npos );
00500         }
00501       }
00502     }
00503   }
00504   return Replaced;
00505 }
00506 
00507 
00508 
00509 string Pcre::_replace_vars(const string& piece) {
00510   Pcre dollar("\\$[0-9]+");
00511   string with = piece;
00512   if(dollar.search(with)) {
00513     for(int index=0; index < num_matches; index++) {
00514       /* do it for each existing sub string */
00515       string sub   = get_match(index); // what "$1" resulted
00516       ostringstream num;
00517       num << index+1;
00518       string dollar_num = "(\\$" + num.str() + ")";
00519       Pcre subsplit(dollar_num); // "\\$1"
00520       // normally 2 (or more) parts, the one in front of and the other one after "$1"
00521       Array splitted = subsplit.split(with); 
00522       string Replaced;
00523       for(size_t pos=0; pos < splitted.size(); pos++) {
00524         if(pos == (splitted.size() - 1))
00525           Replaced += splitted[pos];
00526         else
00527           Replaced += splitted[pos] + sub;
00528       }
00529       with = Replaced; // well, one part is done
00530     }
00531     return with;
00532   }
00533   else {
00534     /* hm, no $[0-9]+ stuff, so just return it untouched */
00535     return with;
00536   }
00537 }

Generated on Sun Feb 23 17:57:14 2003 for PCRE++ by doxygen1.3-rc3