// -*- mode:C++; tab-width:2; c-basic-offset:2; indent-tabs-mode:nil -*-
//
// Copyright (C) 2000-2005 by Roger Rene Kommer / artefaktur, Kassel, Germany.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Library General Public License (LGPL).
//
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// License ACDK-FreeLicense document enclosed in the distribution
// for more details.
// This file is part of the Artefaktur Component Development Kit:
// ACDK
//
// Please refer to
// - http://www.acdk.de
// - http://www.artefaktur.com
// - http://acdk.sourceforge.net
// for more information.
//
// $Header: /cvsroot/acdk/acdk/acdk_text/src/acdk/text/RegExp.cpp,v 1.20 2005/03/08 18:49:55 kommer Exp $
#include "text.h"
#include "RegExp.h"
#include "ParseException.h"
#include "ParseException.h"
#include <acdk/io/ObjectReader.h> // for instantiation purposes
#include <acdk/io/MemWriter.h>
#include <acdk/io/BytePtrReader.h>
#include <acdk/locale/UTF8Encoding.h>
#include <acdk/lang/StringInternals.h>
/**
  C-linkage allocation hook: obtains size bytes from the global
  operator new and hands the raw block back to the caller.
*/
extern "C"
void* acdk_alloc(size_t size)
{
  void* const block = ::operator new(size);
  return block;
}
/**
  C-linkage deallocation hook: releases ptr through the global
  operator delete.
*/
extern "C"
void acdk_free(void* ptr)
{
  ::operator delete (ptr);
}
namespace acdk {
namespace text {
// Number of int slots in the local offset vectors that the methods below
// hand to pcre_exec() / match().
# define MAX_REGEXARGS 256
/**
  Compiles expression with pcre_compile().

  The expression is converted to UTF-8 first and PCRE_UTF8 is always
  added to cflags.

  @param expression the regular expression source
  @param cflags additional option bits forwarded to pcre_compile()
  @throw ParseException if pcre_compile() returns no compiled pattern;
         the message carries the expression, the error offset and the
         error text reported by pcre_compile()
*/
RegExp::RegExp(IN(RString) expression, int cflags/* = 0*/)
//throw(RParseException, RThrowable)
: acdk::lang::Object()
, _errorCode(0)
, _pcre_extra(0)
{
  const char* compileError = 0;
  int errorPos = -1;
  RString utf8Expr = expression->convert(CCUtf8);
  const int options = cflags | PCRE_UTF8;
  _pcre = pcre_compile(utf8Expr->c_str(), options, &compileError, &errorPos, 0);
  if (_pcre != 0)
    return;
  THROW1(ParseException, "Error compiling RegExp: [" + expression + "] at " + errorPos + ": " + compileError);
}
/**
  Releases the compiled pattern through pcre_free() and clears the
  pointer; does nothing when no pattern is held.
*/
RegExp::~RegExp()
{
  if (_pcre == 0)
    return;
  pcre_free(_pcre);
  _pcre = 0;
}
/**
  Checks whether the compiled expression matches str.

  str is encoded to UTF-8 and handed to pcre_exec().

  @param str the text to test
  @param eflags option bits forwarded to pcre_exec()
  @return true if pcre_exec() found a match, false on "no match" or on
          any other pcre_exec() error
*/
//virtual
bool
RegExp::test(IN(RString) str, int eflags)
{
  int margs[MAX_REGEXARGS];
  for (int i = 0; i < MAX_REGEXARGS; ++i)
    margs[i] = -1;
  acdk::io::MemWriter out;
  acdk::locale::UTF8Encoder enc;
  enc.encode(&out, str);
  int ret = pcre_exec(_pcre, _pcre_extra, (const char*)out.getBuffer()->data(), out.getBuffer()->length(), 0, eflags, margs, MAX_REGEXARGS);
  // pcre_exec() returns a negative code for "no match" and errors, a
  // positive count on a match, and 0 when the text matched but the offset
  // vector was too small to hold every group. Unlike POSIX regexec(),
  // 0 is therefore not the only success value.
  return ret >= 0;
}
/**
  Translates a byte offset into the UTF-8 buffer starting at begin into
  a character offset, computed as the distance between two
  Utf8CharIterator positions.

  @param begin start of the UTF-8 encoded text
  @param offset byte offset relative to begin; a negative value (PCRE
         marks unset groups with -1) is returned unchanged
  @return the character offset, or offset itself if it is negative
*/
inline
int
utf8ByteToCharOffset(const char* begin, int offset)
{
  // A negative offset is a "not set" marker, not a position; building an
  // iterator at begin + offset would point before the buffer.
  if (offset < 0)
    return offset;
  Utf8CharIterator it1(begin);
  Utf8CharIterator it2(begin + offset);
  return it2 - it1;
}
/**
  Converts, in place, the first offscount (start, end) pairs of margs
  from byte offsets into character offsets via utf8ByteToCharOffset().

  @param begin start of the UTF-8 encoded text the offsets refer to
  @param offscount number of pairs to convert (2 * offscount ints)
  @param margs the offset vector to rewrite
*/
void
utf8ByteToCharOffsets(const char* begin, int offscount, int* margs)
{
  const int slotCount = offscount * 2;
  for (int slot = 0; slot < slotCount; ++slot)
    margs[slot] = utf8ByteToCharOffset(begin, margs[slot]);
}
/**
  Runs the expression against the UTF-8 bytes [it, end) and returns the
  length of the overall match.

  @param it start of the UTF-8 encoded text
  @param end one past the last byte of the text
  @param eflags option bits forwarded to pcre_exec()
  @return length of the whole match in characters (not bytes), or -1 if
          pcre_exec() reported no match or an error
*/
int
RegExp::_matchSize(const char* it, const char* end, int eflags)
{
  int margs[MAX_REGEXARGS];
  for (int i = 0; i < MAX_REGEXARGS; ++i)
    margs[i] = -1;
  int erg = pcre_exec(_pcre, _pcre_extra, it, end - it, 0, eflags, margs, MAX_REGEXARGS);
  if (erg < 0)
    return -1;
  // Only the first pair (the whole match) is needed. utf8ByteToCharOffsets()
  // takes a number of pairs, so convert exactly one; passing the count of
  // filled ints would convert twice as many slots as PCRE set.
  utf8ByteToCharOffsets(it, 1, margs);
  return margs[1] - margs[0];
}
/**
  Encodes text to UTF-8 and delegates to _matchSize().

  @param text the text to match against
  @param eflags option bits passed through to _matchSize()
  @return whatever _matchSize() returns for the encoded text
*/
int
RegExp::matchSize(IN(RString) text, int eflags)
{
  acdk::io::MemWriter utf8Out;
  acdk::locale::UTF8Encoder encoder;
  encoder.encode(&utf8Out, text);
  const char* first = (const char*)utf8Out.getBuffer()->data();
  const char* last = first + utf8Out.getBuffer()->length();
  return _matchSize(first, last, eflags);
}
/**
  Runs the expression against the UTF-8 bytes [it, end) and stores the
  match positions in slots.

  All slotnum entries are preset to -1 before pcre_exec() is called. On
  success the leading run of set (start, end) pairs is converted from
  byte offsets to character offsets; counting stops at the first pair
  whose start is still -1.

  @param it start of the UTF-8 encoded text
  @param end one past the last byte of the text
  @param slots caller supplied offset vector with room for slotnum ints
  @param slotnum number of ints available in slots
  @param eflags option bits forwarded to pcre_exec()
  @return number of converted (start, end) pairs, or -1 if pcre_exec()
          reported no match or an error
*/
int
RegExp::_match(const char* it, const char* end, int* slots, int slotnum, int eflags)
{
  for (int i = 0; i < slotnum; ++i)
    slots[i] = -1;
  int erg = pcre_exec(_pcre, _pcre_extra, it, end - it, 0, eflags, slots, slotnum);
  if (erg < 0)
    return -1;
  // pcre_exec() reports pairs only in the first two thirds of the vector
  // (the rest is its workspace). Its return value is the number of pairs
  // set, or 0 if there were more groups than the vector could hold.
  const int maxpairs = slotnum / 3;
  const int pairs = (erg == 0 || erg > maxpairs) ? maxpairs : erg;
  // Count pairs, not ints, and never look beyond what the caller supplied.
  int mcount = 0;
  while (mcount < pairs && slots[mcount * 2] != -1)
    ++mcount;
  utf8ByteToCharOffsets(it, mcount, slots);
  return mcount;
}
/**
  Encodes text to UTF-8 and delegates to _match().

  @param text the text to match against
  @param slots caller supplied offset vector, filled by _match()
  @param slotnum number of ints available in slots
  @param eflags option bits passed through to _match()
  @return whatever _match() returns for the encoded text
*/
int
RegExp::match(IN(RString) text, int* slots, int slotnum, int eflags)
{
  acdk::io::MemWriter utf8Out;
  acdk::locale::UTF8Encoder encoder;
  encoder.encode(&utf8Out, text);
  const char* first = (const char*)utf8Out.getBuffer()->data();
  const char* last = first + utf8Out.getBuffer()->length();
  return _match(first, last, slots, slotnum, eflags);
}
/**
  Matches str and returns the matched substrings.

  Element i of the result is str->substr() applied to the i-th
  (start, end) pair reported by the slot based match() overload.

  @param str the text to match against
  @param eflags option bits passed through to match()
  @return one string per reported pair; an empty array if match()
          returned -1
*/
//virtual
RStringArray
RegExp::match(IN(RString) str, int eflags)
{
  int offsets[MAX_REGEXARGS];
  const int found = match(str, offsets, MAX_REGEXARGS, eflags);
  if (found == -1)
    return new StringArray(0);
  RStringArray parts = new StringArray(found);
  for (int group = 0; group < found; ++group)
  {
    const int* span = offsets + group * 2;
    parts[group] = str->substr(span[0], span[1]);
  }
  return parts;
}
/**
  Matches str and returns the match positions.

  Element i of the result is a RegExpMatchPosition built from the i-th
  (start, end) pair reported by the slot based match() overload.

  @param str the text to match against
  @param eflags option bits passed through to match()
  @return one position object per reported pair; an empty array if
          match() returned -1
*/
//virtual
RRegExpMatchPositionArray
RegExp::matchPos(IN(RString) str, int eflags)
{
  int offsets[MAX_REGEXARGS];
  const int found = match(str, offsets, MAX_REGEXARGS, eflags);
  if (found == -1)
    return new RegExpMatchPositionArray(0);
  RRegExpMatchPositionArray positions = new RegExpMatchPositionArray(found);
  for (int group = 0; group < found; ++group)
  {
    const int* span = offsets + group * 2;
    positions[group] = new RegExpMatchPosition(span[0], span[1]);
  }
  return positions;
}
/**
  Replaces text matched by this expression with replwith.

  The text is processed front to back: everything before a match is
  copied, replwith is appended in place of the match, and matching
  continues on the remainder that follows the match.

  @param text the input text
  @param replwith the replacement inserted for each match
  @param replaceAll if false only the first match is replaced
  @return the text with the replacement(s) applied; the unmatched tail
          is appended unchanged
*/
RString
RegExp::replace(IN(RString) text, IN(RString) replwith, bool replaceAll)
{
  RString rest = text;
  RRegExpMatchPositionArray sa = matchPos(rest);
  RString erg;
  while (sa != Nil && sa->length() > 0)
  {
    const int mstart = sa[0]->start;
    const int mend = sa[0]->end;
    erg = erg + rest->substr(0, mstart) + replwith;
    rest = rest->substr(mend);
    if (replaceAll == false)
      break;
    if (mend == mstart)
    {
      // A zero-length match consumes nothing, so matching the same
      // remainder again would loop forever. Copy one character through
      // and continue behind it, or stop once the input is exhausted.
      if (rest->length() == 0)
        break;
      erg = erg + rest->substr(0, 1);
      rest = rest->substr(1);
    }
    sa = matchPos(rest);
  }
  erg = erg + rest;
  return erg;
}
/**
  Builds a copy of str in which every character outside 'a'-'z',
  'A'-'Z' and '0'-'9' is preceded by a backslash.

  @param str the text to escape
  @return the escaped text
*/
//static
RString
RegExp::escape(IN(RString) str)
{
  StringBuffer escaped;
  String::iterator cur = str->begin();
  String::iterator stop = str->end();
  for (; cur < stop; ++cur)
  {
    const bool lower = *cur >= 'a' && *cur <= 'z';
    const bool upper = *cur >= 'A' && *cur <= 'Z';
    const bool digit = *cur >= '0' && *cur <= '9';
    if (!(lower || upper || digit))
      escaped.append('\\');
    escaped.append(*cur);
  }
  return escaped.toString();
}
} // namespace text
} // namespace acdk
|