Main Page   Compound List   File List   Compound Members   File Members  

pcre++.cc

Go to the documentation of this file.
00001 /*
00002  *
00003  *   $Id: pcre++.cc,v 1.2 2002/01/02 01:25:30 zarahg Exp $
00004  *
00005  *  This file  is part of the PCRE++ Class Library.
00006  *
00007  *  By  accessing  this software,  PCRE++, you  are  duly informed
00008  *  of and agree to be  bound  by the  conditions  described below
00009  *  in this notice:
00010  *
00011  *  This software product,  PCRE++,  is developed by Thomas Linden
00012  *  and  copyrighted (C) 2002  by  Thomas Linden,  with all rights 
00013  *  reserved.
00014  *
00015  *  There  is no charge for PCRE++ software.  You can redistribute
00016  *  it and/or modify it under the terms of the GNU  Lesser General
00017  *  Public License, which is incorporated by reference herein.
00018  *
00019  *  PCRE++ is distributed WITHOUT ANY WARRANTY, IMPLIED OR EXPRESS,
00020  *  OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE or that
00021  *  the use of it will not infringe on any third party's intellec-
00022  *  tual property rights.
00023  *
00024  *  You should have received a copy of the GNU Lesser General Public
00025  *  License along with PCRE++.  Copies can also be obtained from:
00026  *
00027  *    http://www.gnu.org/licenses/lgpl.txt
00028  *
00029  *  or by writing to:
00030  *
00031  *  Free Software Foundation, Inc.
00032  *  59 Temple Place, Suite 330
00033  *  Boston, MA 02111-1307
00034  *  USA
00035  *
00036  *  Or contact:
00037  *
00038  *   "Thomas Linden" <tom@daemon.de>
00039  *
00040  *
00041  */
00042 
00043 
00044 #include "pcre++.h"
00045 
00046 Pcre::Pcre(const string& expression) {
00047   _expression   = expression;
00048   _flags        = 0;
00049   case_t = global_t = false;
00050   Compile(0);
00051 }
00052 
00053 Pcre::Pcre(const string& expression, const string& flags) {
00054   _expression   = expression;
00055   unsigned int FLAG = 0;
00056 
00057   for(unsigned int flag=0; flag<flags.length(); flag++) {
00058     switch(flags[flag]) {
00059     case 'i': FLAG |= PCRE_CASELESS;  case_t = true;   break;
00060     case 'm': FLAG |= PCRE_MULTILINE;                  break;
00061     case 's': FLAG |= PCRE_DOTALL;                     break;
00062     case 'x': FLAG |= PCRE_EXTENDED;                   break;
00063     case 'g':                         global_t = true; break;
00064     }
00065   }
00066 
00067   _flags = FLAG;
00068 
00069   Compile(FLAG);
00070 }
00071 
00072 Pcre::Pcre(Pcre &P) {
00073   _expression = P._expression;
00074   _flags      = P._flags;
00075   case_t      = P.case_t;
00076   global_t    = P.global_t;
00077   Compile(_flags);
00078 }
00079 
00080 void Pcre::Compile(int flags) {
00081   /* what happens if p_pcre is already allocated? hm ... */
00082   p_pcre_extra = NULL;
00083   p_pcre       = NULL;
00084   p_pcre       = pcre_compile((char *)_expression.c_str(), flags,
00085                               (const char **)(&err_str), &erroffset, NULL);
00086 
00087   if(p_pcre == NULL) {
00088     /* umh, that's odd, the parser should not fail at all */
00089     string Error = err_str;
00090     throw exception("pcre_compile(..) failed: " + Error);
00091   }
00092 
00093   /* calculate the number of substrings we are willing to catch */
00094   int where;
00095   int info = pcre_fullinfo( p_pcre, p_pcre_extra, PCRE_INFO_CAPTURECOUNT, &where);
00096   if(info == 0) {
00097     sub_len = (where +2) * 3; /* see "man pcre" for the exact formula */
00098   }
00099   else {
00100     throw exception(info);
00101   }
00102   did_match = false;
00103   num_matches = -1;
00104 }
00105 
00106 const Pcre& Pcre::operator = (const string& expression) {
00107   reset();
00108   Pcre *pcre = new Pcre(expression);
00109   return *pcre;
00110 }
00111 
00112 Pcre::~Pcre() {
00113   pcre_free(p_pcre);
00114   pcre_free(p_pcre_extra);
00115   delete sub_vec;
00116   if(num_matches > 0) /* avoid deleting of uninitialized pointer */
00117     delete resultset;
00118 }
00119 
00120 void Pcre::reset() {
00121   did_match   = false;
00122   num_matches = -1;
00123 }
00124 
00125 bool Pcre::search(const string& stuff, int OffSet) {
00126   return dosearch(stuff, OffSet);
00127 }
00128 
00129 bool Pcre::search(const string& stuff) {
00130   return dosearch(stuff, 0);
00131 }
00132 
00133 bool Pcre::dosearch(const string& stuff, int OffSet) {
00134   reset();
00135   sub_vec = new int[sub_len];
00136   int num = pcre_exec(p_pcre, p_pcre_extra, (char *)stuff.c_str(),
00137                         (int)stuff.length(), OffSet, 0, (int *)sub_vec, sub_len);
00138 
00139   if(num < 0) {
00140     /* no match at all */
00141     return false;
00142   }
00143   else if(num == 0) {
00144     /* vector too small, there were too many substrings in stuff */
00145     return false;
00146   }
00147   else if(num == 1) {
00148     /* we had a match, but without substrings */
00149     did_match = true;
00150     num_matches = 0;
00151     return true;
00152   }
00153   else if(num > 1) {
00154     /* we had matching substrings */
00155     resultset = new Array;
00156     const char **stringlist;
00157     did_match = true;
00158     num_matches = num - 1;
00159 
00160     int res = pcre_get_substring_list((char *)stuff.c_str(), sub_vec, num, &stringlist);
00161     if(res == 0) {
00162       for(int i=1; i<num; i++) {
00163         resultset->push_back(stringlist[i]);
00164       }
00165       pcre_free_substring_list(stringlist);
00166     }
00167     else {
00168       throw exception(res);
00169     }
00170     return true;
00171   }
00172   else {
00173     /* some other uncommon error occured */
00174     return false;
00175   }
00176 }
00177 
00178 Array* Pcre::get_sub_strings() {
00179   if(resultset != NULL)
00180     return resultset;
00181   else
00182     return NULL;
00183 }
00184 
00185 string Pcre::get_match(int pos) {
00186   if(pos >= 0 && pos < num_matches) {
00187     ArrayIterator P = resultset->begin() + pos;
00188     return *P;
00189   }
00190   else {
00191     throw exception("out of range");
00192   }
00193 }
00194 
00195 int Pcre::get_match_start(int pos) {
00196   if(pos >= 0 && pos <= num_matches) {
00197     /*
00198      * sub_vec[0] and [1] is the start/end of the entire string.
00199      */
00200     return sub_vec[ (++pos) * 2 ];
00201   }
00202   else {
00203     throw exception("out of range");
00204   }  
00205 }
00206 
00207 int Pcre::get_match_end(int pos) {
00208   if(pos >= 0 && pos <= num_matches) {
00209     /*
00210      * the end offset of a subpattern points to
00211      * the first offset of the next substring,
00212      * therefore -1
00213      */
00214     return sub_vec[ ((++pos) * 2) + 1 ] - 1;
00215   }
00216   else {
00217     throw exception("out of range");
00218   }
00219 }
00220 
00221 size_t Pcre::get_match_length(int pos) {
00222   if(pos >= 0 && pos < num_matches) {
00223     ArrayIterator P = resultset->begin() + pos;
00224     return P->length();
00225   }
00226   else {
00227     throw exception("out of range");
00228   }
00229 }
00230 
00231 Array Pcre::_split(const string& piece, int limit, int start_offset, int end_offset) {
00232   Array Splitted;
00233   /* _expression will be used as delimiter */
00234   if(_expression.length() == 1) {
00235     /* use the plain c++ way, ignore the pre-compiled p_pcre */
00236     string buffer, _delimiter, _piece;
00237     char z;
00238     if(case_t) {
00239       z = toupper(_expression[0]);
00240       for(size_t pos=0; pos < piece.length(); pos++) {
00241         _piece += (char)toupper(piece[pos]);
00242       }
00243     }
00244     else {
00245       z = _expression[0];
00246       _piece = piece;
00247     }
00248     for(size_t pos=0; pos<piece.length(); pos++) {
00249       if(_piece[pos] == z) {
00250         Splitted.push_back(buffer);
00251         buffer = "";
00252       }
00253       else {
00254         buffer += piece[pos];
00255       }
00256     }
00257     if(buffer != "") {
00258       Splitted.push_back(buffer);
00259     }
00260   }
00261   else {
00262     /* use the regex way */
00263     if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) {
00264       /* oh, oh - the pre-compiled expression does not contain brackets */
00265       pcre_free(p_pcre);
00266       pcre_free(p_pcre_extra);
00267       
00268       pcre       *_p = NULL;
00269       pcre_extra *_e = NULL;;
00270 
00271       p_pcre = _p;
00272       p_pcre_extra = _e;
00273 
00274       _expression = "(" + _expression + ")";
00275       Compile(_flags);
00276     }
00277     int num_pieces=0, pos=0, piece_end = 0, piece_start = 0;
00278     for(;;) {
00279       if(search(piece, pos) == true) {
00280         if(matches() > 0) {
00281           piece_end   = get_match_start(0) - 1;
00282           piece_start = pos;
00283           pos = piece_end + 1 + get_match_length(0);
00284           string junk(piece, piece_start, (piece_end - piece_start)+1);
00285           num_pieces++;
00286           if( (limit != 0 && num_pieces < limit) || limit == 0) {
00287             if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00288               if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00289                 /* we are within the allowed range, so just add the grab */
00290                 Splitted.push_back(junk);
00291               }
00292             }
00293           }
00294         }
00295       }
00296       else {
00297         /* the rest of the string, there are no more delimiters */
00298         string junk(piece, pos, (piece.length() - pos));
00299         num_pieces++;
00300         if( (limit != 0 && num_pieces < limit) || limit == 0) {
00301           if( (start_offset != 0 && num_pieces >= start_offset) || start_offset == 0) {
00302             if( (end_offset != 0 && num_pieces <= end_offset) || end_offset == 0) {
00303               /* we are within the allowed range, so just add the grab */
00304               Splitted.push_back(junk);
00305             }
00306           }
00307         }
00308         break;
00309       }
00310     } // for()
00311   } // if(_expression.length()
00312   return Splitted;
00313 }
00314 
00315 Array Pcre::split(const string& piece) {
00316   return _split(piece, 0, 0, 0);
00317 }
00318 
00319 Array Pcre::split(const string& piece, int limit) {
00320   return _split(piece, limit, 0, 0);
00321 }
00322 
00323 Array Pcre::split(const string& piece, int limit, int start_offset) {
00324   return _split(piece, limit, start_offset, 0);
00325 }
00326 
00327 Array Pcre::split(const string& piece, int limit, int start_offset, int end_offset) {
00328   return _split(piece, limit, start_offset, end_offset);
00329 }
00330 
00331 Array Pcre::split(const string& piece, vector<int> positions) {
00332   Array PreSplitted = _split(piece, 0, 0, 0);
00333   Array Splitted;
00334   for(vector<int>::iterator vecIt=positions.begin(); vecIt != positions.end(); ++vecIt) {
00335     Splitted.push_back(PreSplitted[*vecIt]);
00336   }
00337   return Splitted;
00338 }
00339 
00340 
00341 string Pcre::replace(const string& piece, const string& with) {
00342   string Replaced(piece);
00343 
00344   if(_expression[0] != '(' && _expression[ _expression.length() ] != ')' ) {
00345     /* oh, oh - the pre-compiled expression does not contain brackets */
00346     _expression = "(" + _expression + ")";
00347     /* recreate the p_pcre* objects to avoid memory leaks */
00348     pcre_free(p_pcre);
00349     pcre_free(p_pcre_extra);
00350       
00351     pcre       *_p = NULL;
00352     pcre_extra *_e = NULL;;
00353 
00354     p_pcre = _p;
00355     p_pcre_extra = _e;
00356 
00357     _expression = "(" + _expression + ")";
00358     Compile(_flags);
00359   }
00360 
00361   if(search(piece)) {
00362     /* we found at least one match */
00363     string use_with = _replace_vars(with);
00364     if(!global_t) {
00365       /* only once */
00366       if(matched() && matches() >= 1) {
00367         Replaced.replace(get_match_start(0), (get_match_end(0) - get_match_start(0)) + 1, use_with);
00368       }
00369     }
00370     else {
00371       /* multiple times */
00372       Array Splitted = split(piece);
00373       Replaced = "";
00374       for(size_t pos=0; pos < Splitted.size(); pos++) {
00375         if(pos == (Splitted.size() - 1))
00376           Replaced += Splitted[pos];
00377         else
00378           Replaced += Splitted[pos] + with;
00379       }
00380     }
00381   }
00382   return Replaced;
00383 }
00384 
00385 string Pcre::_replace_vars(const string& piece) {
00386   Pcre dollar("\\$[0-9]+");
00387   string with = piece;
00388   if(dollar.search(with)) {
00389     for(int index=0; index < num_matches; index++) {
00390       /* do it for each existing sub string */
00391       string sub   = get_match(index); // what "$1" resulted
00392       ostringstream num(index+1);
00393       Pcre subsplit(string("(\\$") + num.str() + ")"); // "\\$1"
00394       Array splitted = subsplit.split(with); // normally 2 (or more) parts, the one in front of and the other one after "$1"
00395       string Replaced;
00396       for(size_t pos=0; pos < splitted.size(); pos++) {
00397         if(pos == (splitted.size() - 1))
00398           Replaced += splitted[pos];
00399         else
00400           Replaced += splitted[pos] + sub;
00401       }
00402       with = Replaced; // well, one part is done
00403     }
00404     return with;
00405   }
00406   else {
00407     /* hm, no $[0-9]+ stuff, so just return it untouched */
00408     return with;
00409   }
00410 }

Generated on Sat Feb 9 18:04:10 2002 for PCRE++ by doxygen1.2.13.1 written by Dimitri van Heesch, © 1997-2001