Main Page   Namespace List   Compound List   File List   Compound Members   File Members  

pcre++.cc

Go to the documentation of this file.
00001 /*
00002  *
00003  *   $Id: pcre++.cc,v 1.2 2002/01/02 01:25:30 zarahg Exp $
00004  *
00005  *  This file  is part of the PCRE++ Class Library.
00006  *
00007  *  By  accessing  this software,  PCRE++, you  are  duly informed
00008  *  of and agree to be  bound  by the  conditions  described below
00009  *  in this notice:
00010  *
00011  *  This software product,  PCRE++,  is developed by Thomas Linden
00012  *  and  copyrighted (C) 2002  by  Thomas Linden,  with all rights 
00013  *  reserved.
00014  *
00015  *  There  is no charge for PCRE++ software.  You can redistribute
00016  *  it and/or modify it under the terms of the GNU  Lesser General
00017  *  Public License, which is incorporated by reference herein.
00018  *
00019  *  PCRE++ is distributed WITHOUT ANY WARRANTY, IMPLIED OR EXPRESS,
00020  *  OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE or that
00021  *  the use of it will not infringe on any third party's intellec-
00022  *  tual property rights.
00023  *
00024  *  You should have received a copy of the GNU Lesser General Public
00025  *  License along with PCRE++.  Copies can also be obtained from:
00026  *
00027  *    http://www.gnu.org/licenses/lgpl.txt
00028  *
00029  *  or by writing to:
00030  *
00031  *  Free Software Foundation, Inc.
00032  *  59 Temple Place, Suite 330
00033  *  Boston, MA 02111-1307
00034  *  USA
00035  *
00036  *  Or contact:
00037  *
00038  *   "Thomas Linden" <tom@daemon.de>
00039  *
00040  *
00041  */
00042 
00043 
00044 #include "pcre++.h"
00045 
00046 
00047 /*
00048  * CONSTRUCTORS
00049  */
00050 Pcre::Pcre(const string& expression) {
00051   _expression   = expression;
00052   _flags        = 0;
00053   case_t = global_t = false;
00054   zero();
00055   Compile(0);
00056 }
00057 
00058 Pcre::Pcre(const string& expression, const string& flags) {
00059   _expression   = expression;
00060   unsigned int FLAG = 0;
00061 
00062   for(unsigned int flag=0; flag<flags.length(); flag++) {
00063     switch(flags[flag]) {
00064     case 'i': FLAG |= PCRE_CASELESS;  case_t = true;   break;
00065     case 'm': FLAG |= PCRE_MULTILINE;                  break;
00066     case 's': FLAG |= PCRE_DOTALL;                     break;
00067     case 'x': FLAG |= PCRE_EXTENDED;                   break;
00068     case 'g':                         global_t = true; break;
00069     }
00070   }
00071 
00072   _flags = FLAG;
00073 
00074   zero();
00075   Compile(FLAG);
00076 }
00077 
00078 Pcre::Pcre(const Pcre &P) {
00079   _expression = P._expression;
00080   _flags      = P._flags;
00081   case_t      = P.case_t;
00082   global_t    = P.global_t;
00083   zero();
00084   Compile(_flags);
00085 }
00086 
00087 Pcre::Pcre() {
00088   zero();
00089 }
00090 
00091 
00092 
00093 
00094 
00095 
00096 
00097 /*
00098  * Destructor
00099  */
00100 Pcre::~Pcre() {
00101   /* avoid deleting of uninitialized pointers */
00102   if (p_pcre != NULL) {
00103     pcre_free(p_pcre);
00104   }
00105   if (p_pcre_extra != NULL) {
00106     pcre_free(p_pcre_extra);
00107   }
00108   if(sub_vec != NULL) {
00109     delete[] sub_vec;
00110   }
00111   if(num_matches > 0) {
00112     delete resultset;
00113   }
00114   if(err_str != NULL) {
00115     delete err_str;
00116   }
00117 }
00118 
00119 
00120 
00121 
00122 /*
00123  * operator= definitions
00124  */
00125 const Pcre& Pcre::operator = (const string& expression) {
00126   /* reset the object and re-intialize it */
00127   reset();
00128   _expression = expression;
00129   _flags      = 0;
00130   case_t = global_t = false;
00131   Compile(0);
00132   return *this;
00133 }
00134 
00135 
00136 const Pcre& Pcre::operator = (const Pcre &P) {
00137   reset();
00138   _expression = P._expression;
00139   _flags      = P._flags;
00140   case_t      = P.case_t;
00141   global_t    = P.global_t;
00142   zero();
00143   Compile(_flags);
00144   return *this;
00145 }
00146 
00147 
00148 
00149 
00150 
00151 
00152 /*
00153  * mem resetting methods
00154  */
00155 void Pcre::zero() {
00156   /* what happens if p_pcre is already allocated? hm ... */
00157   p_pcre_extra = NULL;
00158   p_pcre       = NULL;
00159   sub_vec      = NULL;
00160   resultset    = NULL;
00161   err_str      = NULL;
00162   num_matches  = -1;
00163 }
00164 
00165 void Pcre::reset() {
00166   did_match   = false;
00167   num_matches = -1;
00168 }
00169 
00170 
00171 
00172 
00173 
00174 /*
00175  * compile the expression
00176  */
00177 void Pcre::Compile(int flags) {
00178   p_pcre       = pcre_compile((char *)_expression.c_str(), flags,
00179                               (const char **)(&err_str), &erroffset, NULL);
00180 
00181   if(p_pcre == NULL) {
00182     /* umh, that's odd, the parser should not fail at all */
00183     string Error = err_str;
00184     throw exception("pcre_compile(..) failed: " + Error);
00185   }
00186 
00187   /* calculate the number of substrings we are willing to catch */
00188   int where;
00189   int info = pcre_fullinfo( p_pcre, p_pcre_extra, PCRE_INFO_CAPTURECOUNT, &where);
00190   if(info == 0) {
00191     sub_len = (where +2) * 3; /* see "man pcre" for the exact formula */
00192   }
00193   else {
00194     throw exception(info);
00195   }
00196   reset();
00197 }
00198 
00199 
00200 
00201 
00202 /*
00203  * API methods
00204  */
00205 bool Pcre::search(const string& stuff, int OffSet) {
00206   return dosearch(stuff, OffSet);
00207 }
00208 
00209 bool Pcre::search(const string& stuff) {
00210   return dosearch(stuff, 0);
00211 }
00212 
00213 bool Pcre::dosearch(const string& stuff, int OffSet) {
00214   reset();
00215   if (sub_vec != NULL)
00216     delete sub_vec;
00217 
00218   sub_vec = new int[sub_len];
00219   int num = pcre_exec(p_pcre, p_pcre_extra, (char *)stuff.c_str(),
00220                         (int)stuff.length(), OffSet, 0, (int *)sub_vec, sub_len);
00221 
00222   if(num < 0) {
00223     /* no match at all */
00224     return false;
00225   }
00226   else if(num == 0) {
00227     /* vector too small, there were too many substrings in stuff */
00228     return false;
00229   }
00230   else if(num == 1) {
00231     /* we had a match, but without substrings */
00232     did_match = true;
00233     num_matches = 0;
00234     return true;
00235   }
00236   else if(num > 1) {
00237     /* we had matching substrings */
00238     if (resultset != NULL)
00239       delete resultset;
00240     resultset = new Array;
00241     const char **stringlist;
00242     did_match = true;
00243     num_matches = num - 1;
00244 
00245     int res = pcre_get_substring_list((char *)stuff.c_str(), sub_vec, num, &stringlist);
00246     if(res == 0) {
00247       for(int i=1; i<num; i++) {
00248         resultset->push_back(stringlist[i]);
00249       }
00250       pcre_free_substring_list(stringlist);
00251     }
00252     else {
00253       throw exception(res);
00254     }
00255     return true;
00256   }
00257   else {
00258     /* some other uncommon error occured */
00259     return false;
00260   }
00261 }
00262 
00263 Array* Pcre::get_sub_strings() {
00264   if(resultset != NULL)
00265     return resultset;
00266   else
00267     return NULL;
00268 }
00269 
00270 string Pcre::get_match(int pos) {
00271   if(pos >= 0 && pos < num_matches) {
00272     ArrayIterator P = resultset->begin() + pos;
00273     return *P;
00274   }
00275   else {
00276     throw exception("out of range");
00277   }
00278 }
00279 
00280 int Pcre::get_match_start() {
00281   if (sub_vec)
00282     return sub_vec[0];
00283   else
00284     return -1;
00285 }
00286 
00287 int Pcre::get_match_end() {
00288   if (sub_vec)
00289     return sub_vec[1] - 1;
00290   else
00291     return -1;
00292 }
00293 
00294 int Pcre::get_match_start(int pos) {
00295   if(pos >= 0 && pos <= num_matches) {
00296     /*
00297      * sub_vec[0] and [1] is the start/end of the entire string.
00298      */
00299     return sub_vec[ (++pos) * 2 ];
00300   }
00301   else {
00302     throw exception("out of range");
00303   }  
00304 }
00305 
00306 int Pcre::get_match_end(int pos) {
00307   if(pos >= 0 && pos <= num_matches) {
00308     /*
00309      * the end offset of a subpattern points to
00310      * the first offset of the next substring,
00311      * therefore -1
00312      */
00313     return sub_vec[ ((++pos) * 2) + 1 ] - 1;
00314   }
00315   else {
00316     throw exception("out of range");
00317   }
00318 }
00319 
00320 size_t Pcre::get_match_length(int pos) {
00321   if(pos >= 0 && pos < num_matches) {
00322     ArrayIterator P = resultset->begin() + pos;
00323     return P->length();
00324   }
00325   else {
00326     throw exception("out of range");
00327   }
00328 }
00329 
00330 Array Pcre::_split(const string& piece, int limit, int start_offset, int end_offset) {
00331   Array Splitted;
00332   /* _expression will be used as delimiter */
00333   if(_expression.length() == 1) {
00334     /* use the plain c++ way, ignore the pre-compiled p_pcre */
00335     string buffer, _delimiter, _piece;
00336     char z;
00337     if(case_t) {
00338       z = toupper(_expression[0]);
00339       for(size_t pos=0; pos < piece.length(); pos++) {
00340         _piece += (char)toupper(piece[pos]);
00341       }
00342     }
00343     else {
00344       z = _expression[0];
00345       _piece = piece;
00346     }
00347     for(size_t pos=0; pos<piece.length(); pos++) {
00348       if(_piece[pos] == z) {
00349         Splitted.push_back(buffer);
00350         buffer = "";
00351       }
00352       else {
00353         buffer += piece[pos];
00354       }
00355     }
00356     if(buffer != "") {
00357       Splitted.push_back(buffer);
00358     }
00359   }
00360   else {
00361     /* use the regex way */
00362     if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) {
00363       /* oh, oh - the pre-compiled expression does not contain brackets */
00364       pcre_free(p_pcre);
00365       pcre_free(p_pcre_extra);
00366       
00367       pcre       *_p = NULL;
00368       pcre_extra *_e = NULL;;
00369 
00370       p_pcre = _p;
00371       p_pcre_extra = _e;
00372 
00373       _expression = "(" + _expression + ")";
00374       Compile(_flags);
00375     }
00376     int num_pieces=0, pos=0, piece_end = 0, piece_start = 0;
00377     for(;;) {
00378       if(search(piece, pos) == true) {
00379         if(matches() > 0) {
00380           piece_end   = get_match_start(0) - 1;
00381           piece_start = pos;
00382           pos = piece_end + 1 + get_match_length(0);
00383           string junk(piece, piece_start, (piece_end - piece_start)+1);
00384           num_pieces++;
00385           if( (limit != 0 && num_pieces < limit) || limit == 0) {
00386             if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00387               if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00388                 /* we are within the allowed range, so just add the grab */
00389                 Splitted.push_back(junk);
00390               }
00391             }
00392           }
00393         }
00394       }
00395       else {
00396         /* the rest of the string, there are no more delimiters */
00397         string junk(piece, pos, (piece.length() - pos));
00398         num_pieces++;
00399         if( (limit != 0 && num_pieces < limit) || limit == 0) {
00400           if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00401             if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00402               /* we are within the allowed range, so just add the grab */
00403               Splitted.push_back(junk);
00404             }
00405           }
00406         }
00407         break;
00408       }
00409     } // for()
00410   } // if(_expression.length()
00411   return Splitted;
00412 }
00413 
00414 Array Pcre::split(const string& piece) {
00415   return _split(piece, 0, 0, 0);
00416 }
00417 
00418 Array Pcre::split(const string& piece, int limit) {
00419   return _split(piece, limit, 0, 0);
00420 }
00421 
00422 Array Pcre::split(const string& piece, int limit, int start_offset) {
00423   return _split(piece, limit, start_offset, 0);
00424 }
00425 
00426 Array Pcre::split(const string& piece, int limit, int start_offset, int end_offset) {
00427   return _split(piece, limit, start_offset, end_offset);
00428 }
00429 
00430 Array Pcre::split(const string& piece, vector<int> positions) {
00431   Array PreSplitted = _split(piece, 0, 0, 0);
00432   Array Splitted;
00433   for(vector<int>::iterator vecIt=positions.begin(); vecIt != positions.end(); ++vecIt) {
00434     Splitted.push_back(PreSplitted[*vecIt]);
00435   }
00436   return Splitted;
00437 }
00438 
00439 
00440 
00441 string Pcre::replace(const string& piece, const string& with) {
00442   string Replaced(piece);
00443 
00444   /*
00445    * very first job: look, if the expression already contains
00446    * braces, if yes, do not add braces, else, do it
00447    */
00448   Pcre braces("[^\\\\]\\(.*[^\\\\]\\)"); // perlish: [^\\]\(.*[^\\]\)
00449   if(! braces.search(_expression)) {
00450     //  if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) {
00451     /* oh, oh - the pre-compiled expression does not contain brackets */
00452 
00453     /* recreate the p_pcre* objects to avoid memory leaks */
00454     pcre_free(p_pcre);
00455     pcre_free(p_pcre_extra);
00456       
00457     pcre       *_p = NULL;
00458     pcre_extra *_e = NULL;;
00459 
00460     p_pcre = _p;
00461     p_pcre_extra = _e;
00462 
00463     _expression = "(" + _expression + ")";
00464     Compile(_flags);
00465   }
00466 
00467   if(search(piece)) {
00468     /* we found at least one match */
00469     string use_with = _replace_vars(with);
00470     if(!global_t) {
00471       /*
00472        * only once, use the entire match
00473        * Patch submitted by Mark Carrington <mark@mutantpenguin.co.uk>
00474        */
00475       if(matched() && matches() >= 1) {
00476         int len = get_match_end() - get_match_start() + 1;
00477         Replaced.replace(get_match_start(0), len, use_with);
00478       }
00479     }
00480     else {
00481       /*
00482        * global replace.
00483        *
00484        * We need to keep checking the line after it is modified to see the next match.
00485        * Especially \s is something of a bitch as it can be a newline, return carriage,
00486        * space, tab, etc ... so we have to keep  searching for the next type.
00487        * Patch submitted by Jim Hull <imaginos@imaginos.net>
00488        */
00489       string sLeftOver = Replaced;
00490       int iCurPosition = 0;
00491       while( search( sLeftOver ) ) {
00492         if( matched() && matches() >= 1 ) {
00493           int len = 0;
00494           string lookfor;
00495           lookfor.erase();
00496           int match_pos;
00497           for (match_pos = 0; match_pos < matches(); match_pos++) {
00498             len += ((get_match_end(match_pos) - get_match_start(match_pos)) + 1);
00499             lookfor += get_match(match_pos);
00500           }
00501           match_pos = Replaced.find( lookfor, iCurPosition );
00502           Replaced.replace(match_pos, len, use_with);
00503           iCurPosition = ( match_pos + use_with.length() );
00504           sLeftOver = Replaced.substr( iCurPosition, string::npos );
00505         }
00506       }
00507     }
00508   }
00509   return Replaced;
00510 }
00511 
00512 
00513 
00514 string Pcre::_replace_vars(const string& piece) {
00515   Pcre dollar("\\$[0-9]+");
00516   string with = piece;
00517   if(dollar.search(with)) {
00518     for(int index=0; index < num_matches; index++) {
00519       /* do it for each existing sub string */
00520       string sub   = get_match(index); // what "$1" resulted
00521       ostringstream num;
00522       num << index+1;
00523       string dollar_num = "(\\$" + num.str() + ")";
00524       Pcre subsplit(dollar_num); // "\\$1"
00525       // normally 2 (or more) parts, the one in front of and the other one after "$1"
00526       Array splitted = subsplit.split(with); 
00527       string Replaced;
00528       for(size_t pos=0; pos < splitted.size(); pos++) {
00529         if(pos == (splitted.size() - 1))
00530           Replaced += splitted[pos];
00531         else
00532           Replaced += splitted[pos] + sub;
00533       }
00534       with = Replaced; // well, one part is done
00535     }
00536     return with;
00537   }
00538   else {
00539     /* hm, no $[0-9]+ stuff, so just return it untouched */
00540     return with;
00541   }
00542 }

Generated on Mon Jul 22 22:27:34 2002 for PCRE++ by doxygen1.2.13.1 written by Dimitri van Heesch, © 1997-2001