Main Page   Namespace List   Compound List   File List   Compound Members   File Members  

pcre++.cc

Go to the documentation of this file.
00001 /*
00002  *
00003  *   $Id: pcre++.cc,v 1.2 2002/01/02 01:25:30 zarahg Exp $
00004  *
00005  *  This file  is part of the PCRE++ Class Library.
00006  *
00007  *  By  accessing  this software,  PCRE++, you  are  duly informed
00008  *  of and agree to be  bound  by the  conditions  described below
00009  *  in this notice:
00010  *
00011  *  This software product,  PCRE++,  is developed by Thomas Linden
00012  *  and  copyrighted (C) 2002  by  Thomas Linden,  with all rights 
00013  *  reserved.
00014  *
00015  *  There  is no charge for PCRE++ software.  You can redistribute
00016  *  it and/or modify it under the terms of the GNU  Lesser General
00017  *  Public License, which is incorporated by reference herein.
00018  *
00019  *  PCRE++ is distributed WITHOUT ANY WARRANTY, IMPLIED OR EXPRESS,
00020  *  OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE or that
00021  *  the use of it will not infringe on any third party's intellec-
00022  *  tual property rights.
00023  *
00024  *  You should have received a copy of the GNU Lesser General Public
00025  *  License along with PCRE++.  Copies can also be obtained from:
00026  *
00027  *    http://www.gnu.org/licenses/lgpl.txt
00028  *
00029  *  or by writing to:
00030  *
00031  *  Free Software Foundation, Inc.
00032  *  59 Temple Place, Suite 330
00033  *  Boston, MA 02111-1307
00034  *  USA
00035  *
00036  *  Or contact:
00037  *
00038  *   "Thomas Linden" <tom@daemon.de>
00039  *
00040  *
00041  */
00042 
00043 
00044 #include "pcre++.h"
00045 
00046 
00047 /*
00048  * CONSTRUCTORS
00049  */
00050 Pcre::Pcre(const string& expression) {
00051   _expression   = expression;
00052   _flags        = 0;
00053   case_t = global_t = false;
00054   zero();
00055   Compile(0);
00056 }
00057 
00058 Pcre::Pcre(const string& expression, const string& flags) {
00059   _expression   = expression;
00060   unsigned int FLAG = 0;
00061 
00062   for(unsigned int flag=0; flag<flags.length(); flag++) {
00063     switch(flags[flag]) {
00064     case 'i': FLAG |= PCRE_CASELESS;  case_t = true;   break;
00065     case 'm': FLAG |= PCRE_MULTILINE;                  break;
00066     case 's': FLAG |= PCRE_DOTALL;                     break;
00067     case 'x': FLAG |= PCRE_EXTENDED;                   break;
00068     case 'g':                         global_t = true; break;
00069     }
00070   }
00071 
00072   _flags = FLAG;
00073 
00074   zero();
00075   Compile(FLAG);
00076 }
00077 
00078 Pcre::Pcre(const Pcre &P) {
00079   _expression = P._expression;
00080   _flags      = P._flags;
00081   case_t      = P.case_t;
00082   global_t    = P.global_t;
00083   zero();
00084   Compile(_flags);
00085 }
00086 
00087 Pcre::Pcre() {
00088   zero();
00089 }
00090 
00091 
00092 
00093 
00094 
00095 
00096 
00097 /*
00098  * Destructor
00099  */
00100 Pcre::~Pcre() {
00101   /* avoid deleting of uninitialized pointers */
00102   if (p_pcre != NULL) {
00103     pcre_free(p_pcre);
00104   }
00105   if (p_pcre_extra != NULL) {
00106     pcre_free(p_pcre_extra);
00107   }
00108   if(sub_vec != NULL) {
00109     delete[] sub_vec;
00110   }
00111   if(num_matches > 0) {
00112     delete resultset;
00113   }
00114   if(err_str != NULL) {
00115     delete err_str;
00116   }
00117 }
00118 
00119 
00120 
00121 
00122 /*
00123  * operator= definitions
00124  */
00125 const Pcre& Pcre::operator = (const string& expression) {
00126   /* reset the object and re-intialize it */
00127   reset();
00128   _expression = expression;
00129   _flags      = 0;
00130   case_t = global_t = false;
00131   Compile(0);
00132   return *this;
00133 }
00134 
00135 
00136 const Pcre& Pcre::operator = (const Pcre &P) {
00137   reset();
00138   _expression = P._expression;
00139   _flags      = P._flags;
00140   case_t      = P.case_t;
00141   global_t    = P.global_t;
00142   zero();
00143   Compile(_flags);
00144   return *this;
00145 }
00146 
00147 
00148 
00149 
00150 
00151 
00152 /*
00153  * mem resetting methods
00154  */
00155 void Pcre::zero() {
00156   /* what happens if p_pcre is already allocated? hm ... */
00157   p_pcre_extra = NULL;
00158   p_pcre       = NULL;
00159   sub_vec      = NULL;
00160   resultset    = NULL;
00161   err_str      = NULL;
00162   num_matches  = -1;
00163 }
00164 
00165 void Pcre::reset() {
00166   did_match   = false;
00167   num_matches = -1;
00168 }
00169 
00170 
00171 
00172 
00173 
00174 /*
00175  * compile the expression
00176  */
00177 void Pcre::Compile(int flags) {
00178   p_pcre       = pcre_compile((char *)_expression.c_str(), flags,
00179                               (const char **)(&err_str), &erroffset, NULL);
00180 
00181   if(p_pcre == NULL) {
00182     /* umh, that's odd, the parser should not fail at all */
00183     string Error = err_str;
00184     throw exception("pcre_compile(..) failed: " + Error);
00185   }
00186 
00187   /* calculate the number of substrings we are willing to catch */
00188   int where;
00189   int info = pcre_fullinfo( p_pcre, p_pcre_extra, PCRE_INFO_CAPTURECOUNT, &where);
00190   if(info == 0) {
00191     sub_len = (where +2) * 3; /* see "man pcre" for the exact formula */
00192   }
00193   else {
00194     throw exception(info);
00195   }
00196   reset();
00197 }
00198 
00199 
00200 
00201 
00202 /*
00203  * API methods
00204  */
00205 bool Pcre::search(const string& stuff, int OffSet) {
00206   return dosearch(stuff, OffSet);
00207 }
00208 
00209 bool Pcre::search(const string& stuff) {
00210   return dosearch(stuff, 0);
00211 }
00212 
00213 bool Pcre::dosearch(const string& stuff, int OffSet) {
00214   reset();
00215   sub_vec = new int[sub_len];
00216   int num = pcre_exec(p_pcre, p_pcre_extra, (char *)stuff.c_str(),
00217                         (int)stuff.length(), OffSet, 0, (int *)sub_vec, sub_len);
00218 
00219   if(num < 0) {
00220     /* no match at all */
00221     return false;
00222   }
00223   else if(num == 0) {
00224     /* vector too small, there were too many substrings in stuff */
00225     return false;
00226   }
00227   else if(num == 1) {
00228     /* we had a match, but without substrings */
00229     did_match = true;
00230     num_matches = 0;
00231     return true;
00232   }
00233   else if(num > 1) {
00234     /* we had matching substrings */
00235     resultset = new Array;
00236     const char **stringlist;
00237     did_match = true;
00238     num_matches = num - 1;
00239 
00240     int res = pcre_get_substring_list((char *)stuff.c_str(), sub_vec, num, &stringlist);
00241     if(res == 0) {
00242       for(int i=1; i<num; i++) {
00243         resultset->push_back(stringlist[i]);
00244       }
00245       pcre_free_substring_list(stringlist);
00246     }
00247     else {
00248       throw exception(res);
00249     }
00250     return true;
00251   }
00252   else {
00253     /* some other uncommon error occured */
00254     return false;
00255   }
00256 }
00257 
00258 Array* Pcre::get_sub_strings() {
00259   if(resultset != NULL)
00260     return resultset;
00261   else
00262     return NULL;
00263 }
00264 
00265 string Pcre::get_match(int pos) {
00266   if(pos >= 0 && pos < num_matches) {
00267     ArrayIterator P = resultset->begin() + pos;
00268     return *P;
00269   }
00270   else {
00271     throw exception("out of range");
00272   }
00273 }
00274 
00275 int Pcre::get_match_start(int pos) {
00276   if(pos >= 0 && pos <= num_matches) {
00277     /*
00278      * sub_vec[0] and [1] is the start/end of the entire string.
00279      */
00280     return sub_vec[ (++pos) * 2 ];
00281   }
00282   else {
00283     throw exception("out of range");
00284   }  
00285 }
00286 
00287 int Pcre::get_match_end(int pos) {
00288   if(pos >= 0 && pos <= num_matches) {
00289     /*
00290      * the end offset of a subpattern points to
00291      * the first offset of the next substring,
00292      * therefore -1
00293      */
00294     return sub_vec[ ((++pos) * 2) + 1 ] - 1;
00295   }
00296   else {
00297     throw exception("out of range");
00298   }
00299 }
00300 
00301 size_t Pcre::get_match_length(int pos) {
00302   if(pos >= 0 && pos < num_matches) {
00303     ArrayIterator P = resultset->begin() + pos;
00304     return P->length();
00305   }
00306   else {
00307     throw exception("out of range");
00308   }
00309 }
00310 
00311 Array Pcre::_split(const string& piece, int limit, int start_offset, int end_offset) {
00312   Array Splitted;
00313   /* _expression will be used as delimiter */
00314   if(_expression.length() == 1) {
00315     /* use the plain c++ way, ignore the pre-compiled p_pcre */
00316     string buffer, _delimiter, _piece;
00317     char z;
00318     if(case_t) {
00319       z = toupper(_expression[0]);
00320       for(size_t pos=0; pos < piece.length(); pos++) {
00321         _piece += (char)toupper(piece[pos]);
00322       }
00323     }
00324     else {
00325       z = _expression[0];
00326       _piece = piece;
00327     }
00328     for(size_t pos=0; pos<piece.length(); pos++) {
00329       if(_piece[pos] == z) {
00330         Splitted.push_back(buffer);
00331         buffer = "";
00332       }
00333       else {
00334         buffer += piece[pos];
00335       }
00336     }
00337     if(buffer != "") {
00338       Splitted.push_back(buffer);
00339     }
00340   }
00341   else {
00342     /* use the regex way */
00343     if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) {
00344       /* oh, oh - the pre-compiled expression does not contain brackets */
00345       pcre_free(p_pcre);
00346       pcre_free(p_pcre_extra);
00347       
00348       pcre       *_p = NULL;
00349       pcre_extra *_e = NULL;;
00350 
00351       p_pcre = _p;
00352       p_pcre_extra = _e;
00353 
00354       _expression = "(" + _expression + ")";
00355       Compile(_flags);
00356     }
00357     int num_pieces=0, pos=0, piece_end = 0, piece_start = 0;
00358     for(;;) {
00359       if(search(piece, pos) == true) {
00360         if(matches() > 0) {
00361           piece_end   = get_match_start(0) - 1;
00362           piece_start = pos;
00363           pos = piece_end + 1 + get_match_length(0);
00364           string junk(piece, piece_start, (piece_end - piece_start)+1);
00365           num_pieces++;
00366           if( (limit != 0 && num_pieces < limit) || limit == 0) {
00367             if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00368               if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00369                 /* we are within the allowed range, so just add the grab */
00370                 Splitted.push_back(junk);
00371               }
00372             }
00373           }
00374         }
00375       }
00376       else {
00377         /* the rest of the string, there are no more delimiters */
00378         string junk(piece, pos, (piece.length() - pos));
00379         num_pieces++;
00380         if( (limit != 0 && num_pieces < limit) || limit == 0) {
00381           if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00382             if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00383               /* we are within the allowed range, so just add the grab */
00384               Splitted.push_back(junk);
00385             }
00386           }
00387         }
00388         break;
00389       }
00390     } // for()
00391   } // if(_expression.length()
00392   return Splitted;
00393 }
00394 
00395 Array Pcre::split(const string& piece) {
00396   return _split(piece, 0, 0, 0);
00397 }
00398 
00399 Array Pcre::split(const string& piece, int limit) {
00400   return _split(piece, limit, 0, 0);
00401 }
00402 
00403 Array Pcre::split(const string& piece, int limit, int start_offset) {
00404   return _split(piece, limit, start_offset, 0);
00405 }
00406 
00407 Array Pcre::split(const string& piece, int limit, int start_offset, int end_offset) {
00408   return _split(piece, limit, start_offset, end_offset);
00409 }
00410 
00411 Array Pcre::split(const string& piece, vector<int> positions) {
00412   Array PreSplitted = _split(piece, 0, 0, 0);
00413   Array Splitted;
00414   for(vector<int>::iterator vecIt=positions.begin(); vecIt != positions.end(); ++vecIt) {
00415     Splitted.push_back(PreSplitted[*vecIt]);
00416   }
00417   return Splitted;
00418 }
00419 
00420 
00421 
00422 string Pcre::replace(const string& piece, const string& with) {
00423   string Replaced(piece);
00424 
00425   /*
00426    * very first job: look, if the expression already contains
00427    * braces, if yes, do not add braces, else, do it
00428    */
00429   Pcre braces("[^\\\\]\\(.*[^\\\\]\\)"); // perlish: [^\\]\(.*[^\\]\)
00430   if(! braces.search(_expression)) {
00431     //  if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) {
00432     /* oh, oh - the pre-compiled expression does not contain brackets */
00433 
00434     /* recreate the p_pcre* objects to avoid memory leaks */
00435     pcre_free(p_pcre);
00436     pcre_free(p_pcre_extra);
00437       
00438     pcre       *_p = NULL;
00439     pcre_extra *_e = NULL;;
00440 
00441     p_pcre = _p;
00442     p_pcre_extra = _e;
00443 
00444     _expression = "(" + _expression + ")";
00445     Compile(_flags);
00446   }
00447 
00448   if(search(piece)) {
00449     /* we found at least one match */
00450     string use_with = _replace_vars(with);
00451     if(!global_t) {
00452       /* only once */
00453       if(matched() && matches() >= 1) {
00454         int len = 0;
00455         if (matches() >= 1) {
00456           for (int match_pos = 0; match_pos < matches(); match_pos++) {
00457             // cout << "match[" << match_pos << "]: " << get_match(match_pos) << endl;
00458             len += ((get_match_end(match_pos) - get_match_start(match_pos)) + 1);
00459           }
00460         }
00461         Replaced.replace(get_match_start(0), len, use_with);
00462       }
00463     }
00464     else {
00465       /* global replace */
00466       if(matched() && matches() >= 1) {
00467         int len = 0;
00468         string lookfor;
00469         lookfor.erase();
00470         int match_pos;
00471         for (match_pos = 0; match_pos < matches(); match_pos++) {
00472           // cout << "match[" << match_pos << "]: " << get_match(match_pos) << endl;
00473           len += ((get_match_end(match_pos) - get_match_start(match_pos)) + 1);
00474           lookfor += get_match(match_pos);
00475         }
00476         match_pos = Replaced.find(lookfor);
00477         while((size_t)match_pos != string::npos) {
00478           Replaced.replace(match_pos, len, use_with);
00479           match_pos = Replaced.find(lookfor);
00480         }
00481       }
00482     }
00483   }
00484   return Replaced;
00485 }
00486 
00487 
00488 
00489 string Pcre::_replace_vars(const string& piece) {
00490   Pcre dollar("\\$[0-9]+");
00491   string with = piece;
00492   if(dollar.search(with)) {
00493     for(int index=0; index < num_matches; index++) {
00494       /* do it for each existing sub string */
00495       string sub   = get_match(index); // what "$1" resulted
00496       ostringstream num;
00497       num << index+1;
00498       string dollar_num = "(\\$" + num.str() + ")";
00499       Pcre subsplit(dollar_num); // "\\$1"
00500       // normally 2 (or more) parts, the one in front of and the other one after "$1"
00501       Array splitted = subsplit.split(with); 
00502       string Replaced;
00503       for(size_t pos=0; pos < splitted.size(); pos++) {
00504         if(pos == (splitted.size() - 1))
00505           Replaced += splitted[pos];
00506         else
00507           Replaced += splitted[pos] + sub;
00508       }
00509       with = Replaced; // well, one part is done
00510     }
00511     return with;
00512   }
00513   else {
00514     /* hm, no $[0-9]+ stuff, so just return it untouched */
00515     return with;
00516   }
00517 }

Generated on Thu May 2 00:31:43 2002 for PCRE++ by doxygen1.2.13.1 written by Dimitri van Heesch, © 1997-2001