Showing posts with label RE2. Show all posts
Showing posts with label RE2. Show all posts

Thursday, May 16, 2013

How to use named group feature in Google RE2 regular expression engine

After some googling and then posting a question to the re2-dev Google group, I couldn't find any useful help on using "named groups" in RE2.
For example, if you want to extract the protocol, URL path and query string parts from a given URL string, it's a good idea to use named groups in RE2:

sample urls :
http://localhost:8080/hello?name=Hassan

A sample (but not perfect) regex might be:
^(?P<PROTO>https?)://(?P<URL>.+)\?(?P<QUERY>.+)?$

The best way to get the PROTO, URL and QUERY values is to use the named-group capturing feature of RE2.
This is the code I wrote for this:

x
bool FindNamedGroups(const std::string &p_regex,const std::string &p_text,std::map<std::string,std::string> *p_group_value)
 {
  p_group_value->clear();
  RE2 rx(p_regex);
  if(!rx.ok())
  {
   std::cerr << "Invalid Regular Expression :" << p_regex << std::endl;
   return false;
  }
  size_t named_grp_size =rx.NumberOfCapturingGroups();
  if(named_grp_size>10)
  {
   std::cerr << "No support for more than 10 named groups :" << named_grp_size<< std::endl;
   return false;
  }
  const std::map<std::string,int> &grp_to_idx=rx.NamedCapturingGroups();
  RE2::Arg args[10];
  std::string vars[10];
  const RE2::Arg * const p_args[10]={&args[0],&args[1],&args[2],&args[3],&args[4],&args[5],&args[6],&args[7],&args[8],&args[9]};
  int var_count=0;

  for(var_count=0;var_count<10;var_count++)
   args[var_count]=&vars[var_count];

  re2::StringPiece sp_input(p_text);
  //after running following function. matched groups value  will be stored in p_args which point to args which point to vars!
  bool found= RE2::FindAndConsumeN(&sp_input,rx,p_args,named_grp_size);
  if(!found)
  {
   return false ;
  }

  std::map<std::string,int>::const_iterator iter_grps=grp_to_idx.cbegin();
  for(;iter_grps!=grp_to_idx.cend();++iter_grps)
  {
   (*p_group_value)[iter_grps->first]=vars[iter_grps->second-1];
  }
  return true;
 
 }
//////////// USAGE ////////////////
FindNamedGroups("^(?P<PROTO>https?)://(?P<URL>.+)\\?(?P<QUERY>.+)?$","http://localhost:8080/hello?name=Hassan",&g_v);
 iter=g_v.cbegin();
 for(;iter!=g_v.cend();++iter)
  std::cout << iter->first << " = " << iter->second << std::endl;
x

Thursday, June 14, 2012

Find all links inside a HTML source using Google RE2

Here is a small piece of code that shows how to find all links inside HTML source code using Google's regular expression library (RE2):
-----------------------------------------------------------------
// Sample HTML containing two single-quoted href attributes.
string text=" html source goes here <a href='http://google.com/top-l/?search=xyz&x=t'>site link</a> testing <a   href= '../news/show?id=4'>ms</a>";
// The pattern is a raw string literal so that \s, \w, \? and \- reach RE2
// unchanged; in an ordinary "..." literal they would need doubled backslashes.
// Group 1 captures the text between the single quotes after href=.
// NOTE(review): "{1,2}" sits inside a character class, so it only adds the
// literal characters { 1 , 2 } to the set - it is not a repetition count.
 re2::RE2 linksre(R"(<a[\s\w]*href=\s*'([\w:/{1,2}.\?=\-&%]*)'[\s\w]*>)");
string res;
re2::StringPiece html(text);
// Each successful FindAndConsume call stores group 1 in res and moves html
// past the match, so the loop visits every link in turn.
while(RE2::FindAndConsume(&html, linksre ,&res))
{
cout << "("<<res<<")"<<endl;
}
-----------------------------------------------------------------

output is :
(http://google.com/top-l/?search=xyz&x=t)
(../news/show?id=4)

Showing posts with label RE2. Show all posts
Showing posts with label RE2. Show all posts

Thursday, May 16, 2013

How to use named group feature in Google RE2 regular expression engine

After some googling and then posting a question to the re2-dev Google group, I couldn't find any useful help on using "named groups" in RE2.
For example, if you want to extract the protocol, URL path and query string parts from a given URL string, it's a good idea to use named groups in RE2:

sample urls :
http://localhost:8080/hello?name=Hassan

A sample (but not perfect) regex might be:
^(?P<PROTO>https?)://(?P<URL>.+)\?(?P<QUERY>.+)?$

The best way to get the PROTO, URL and QUERY values is to use the named-group capturing feature of RE2.
This is the code I wrote for this:

x
bool FindNamedGroups(const std::string &p_regex,const std::string &p_text,std::map<std::string,std::string> *p_group_value)
 {
  p_group_value->clear();
  RE2 rx(p_regex);
  if(!rx.ok())
  {
   std::cerr << "Invalid Regular Expression :" << p_regex << std::endl;
   return false;
  }
  size_t named_grp_size =rx.NumberOfCapturingGroups();
  if(named_grp_size>10)
  {
   std::cerr << "No support for more than 10 named groups :" << named_grp_size<< std::endl;
   return false;
  }
  const std::map<std::string,int> &grp_to_idx=rx.NamedCapturingGroups();
  RE2::Arg args[10];
  std::string vars[10];
  const RE2::Arg * const p_args[10]={&args[0],&args[1],&args[2],&args[3],&args[4],&args[5],&args[6],&args[7],&args[8],&args[9]};
  int var_count=0;

  for(var_count=0;var_count<10;var_count++)
   args[var_count]=&vars[var_count];

  re2::StringPiece sp_input(p_text);
  //after running following function. matched groups value  will be stored in p_args which point to args which point to vars!
  bool found= RE2::FindAndConsumeN(&sp_input,rx,p_args,named_grp_size);
  if(!found)
  {
   return false ;
  }

  std::map<std::string,int>::const_iterator iter_grps=grp_to_idx.cbegin();
  for(;iter_grps!=grp_to_idx.cend();++iter_grps)
  {
   (*p_group_value)[iter_grps->first]=vars[iter_grps->second-1];
  }
  return true;
 
 }
//////////// USAGE ////////////////
FindNamedGroups("^(?P<PROTO>https?)://(?P<URL>.+)\\?(?P<QUERY>.+)?$","http://localhost:8080/hello?name=Hassan",&g_v);
 iter=g_v.cbegin();
 for(;iter!=g_v.cend();++iter)
  std::cout << iter->first << " = " << iter->second << std::endl;
x

Thursday, June 14, 2012

Find all links inside a HTML source using Google RE2

Here is a small piece of code that shows how to find all links inside HTML source code using Google's regular expression library (RE2):
-----------------------------------------------------------------
// Sample HTML containing two single-quoted href attributes.
string text=" html source goes here <a href='http://google.com/top-l/?search=xyz&x=t'>site link</a> testing <a   href= '../news/show?id=4'>ms</a>";
// The pattern is a raw string literal so that \s, \w, \? and \- reach RE2
// unchanged; in an ordinary "..." literal they would need doubled backslashes.
// Group 1 captures the text between the single quotes after href=.
// NOTE(review): "{1,2}" sits inside a character class, so it only adds the
// literal characters { 1 , 2 } to the set - it is not a repetition count.
 re2::RE2 linksre(R"(<a[\s\w]*href=\s*'([\w:/{1,2}.\?=\-&%]*)'[\s\w]*>)");
string res;
re2::StringPiece html(text);
// Each successful FindAndConsume call stores group 1 in res and moves html
// past the match, so the loop visits every link in turn.
while(RE2::FindAndConsume(&html, linksre ,&res))
{
cout << "("<<res<<")"<<endl;
}
-----------------------------------------------------------------

output is :
(http://google.com/top-l/?search=xyz&x=t)
(../news/show?id=4)