You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
proxysql/lib/c_tokenizer.c

480 lines
9.0 KiB

/* c_tokenizer.c */
// Borrowed from http://www.cplusplus.com/faq/sequences/strings/split/
#include <stdlib.h>
#include <string.h>
#include "c_tokenizer.h"
extern __thread int mysql_thread___query_digests_max_query_length;
#include <ctype.h>
#define bool char
extern __thread bool mysql_thread___query_digests_lowercase;
void tokenizer(tokenizer_t *result, const char* s, const char* delimiters, int empties )
{
//tokenizer_t result;
result->s_length = ( (s && delimiters) ? strlen(s) : 0 );
result->s = NULL;
if (result->s_length) {
if (result->s_length > (PROXYSQL_TOKENIZER_BUFFSIZE-1)) {
result->s = strdup(s);
} else {
strcpy(result->buffer,s);
result->s = result->buffer;
}
}
result->delimiters = delimiters;
result->current = NULL;
result->next = result->s;
result->is_ignore_empties = (empties != TOKENIZER_EMPTIES_OK);
//return result;
}
const char* free_tokenizer( tokenizer_t* tokenizer )
{
if (tokenizer->s_length > (PROXYSQL_TOKENIZER_BUFFSIZE-1)) {
free(tokenizer->s);
}
tokenizer->s = NULL;
return NULL;
}
const char* tokenize( tokenizer_t* tokenizer )
{
if (!tokenizer->s) return NULL;
if (!tokenizer->next)
return free_tokenizer( tokenizer );
tokenizer->current = tokenizer->next;
tokenizer->next = strpbrk( tokenizer->current, tokenizer->delimiters );
if (tokenizer->next)
{
*tokenizer->next = '\0';
tokenizer->next += 1;
if (tokenizer->is_ignore_empties)
{
tokenizer->next += strspn( tokenizer->next, tokenizer->delimiters );
if (!(*tokenizer->current))
return tokenize( tokenizer );
}
}
else if (tokenizer->is_ignore_empties && !(*tokenizer->current))
return free_tokenizer( tokenizer );
return tokenizer->current;
}
void c_split_2(const char *in, const char *del, char **out1, char **out2) {
*out1=NULL;
*out2=NULL;
const char *t;
tokenizer_t tok;
tokenizer( &tok, in, del, TOKENIZER_NO_EMPTIES );
for ( t=tokenize(&tok); t; t=tokenize(&tok)) {
if (*out1==NULL) { *out1=strdup(t); continue; }
if (*out2==NULL) { *out2=strdup(t); continue; }
}
if (*out1==NULL) *out1=strdup("");
if (*out2==NULL) *out2=strdup("");
free_tokenizer( &tok );
}
#define SIZECHAR sizeof(char)
// check char if it could be table name
static inline char is_normal_char(char c)
{
if(c >= 'a' && c <= 'z')
return 1;
if(c >= 'A' && c <= 'Z')
return 1;
if(c >= '0' && c <= '9')
return 1;
if(c == '$' || c == '_')
return 1;
return 0;
}
// token char - not table name string
static inline char is_token_char(char c)
{
return !is_normal_char(c);
}
// space - it's much easy to remove duplicated space chars
static inline char is_space_char(char c)
{
if(c == ' ' || c == '\t' || c == '\n' || c == '\r')
return 1;
return 0;
}
// check digit
static inline char is_digit_char(char c)
{
if(c >= '0' && c <= '9')
return 1;
return 0;
}
// check if it can be HEX char
static inline char is_hex_char(char c)
{
if((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
return 1;
return 0;
}
// between pointer, check string is number - need to be changed more functions
static char is_digit_string(char *f, char *t)
{
if(f == t)
{
if(is_digit_char(*f))
return 1;
else
return 0;
}
int is_hex = 0;
int i = 0;
// 0x, 0X
while(f != t)
{
if(i == 1 && *(f-1) == '0' && (*f == 'x' || *f == 'X'))
{
is_hex = 1;
}
// none hex
else if(!is_hex && !is_digit_char(*f))
{
return 0;
}
// hex
else if(is_hex && !is_hex_char(*f))
{
return 0;
}
f++;
i++;
}
// need to be added function ----------------
// 23e
// 23e+1
return 1;
}
char *mysql_query_digest_and_first_comment(char *s, int _len, char **first_comment, char *buf){
int i = 0;
char cur_comment[FIRST_COMMENT_MAX_LENGTH];
cur_comment[0]=0;
int ccl=0;
int cmd=0;
int len = _len;
if (_len > mysql_thread___query_digests_max_query_length) {
len = mysql_thread___query_digests_max_query_length;
}
char *r = buf;
if (r==NULL) {
r = (char *) malloc(len + SIZECHAR);
}
char *p_r = r;
char *p_r_t = r;
char prev_char = 0;
char qutr_char = 0;
char flag = 0;
char fc=0;
int fc_len=0;
char fns=0;
bool lowercase=0;
lowercase=mysql_thread___query_digests_lowercase;
while(i < len)
{
// =================================================
// START - read token char and set flag what's going on.
// =================================================
if(flag == 0)
{
// store current position
p_r_t = p_r;
// comment type 1 - start with '/*'
if(prev_char == '/' && *s == '*')
{
ccl=0;
flag = 1;
if (*(s+1)=='!')
cmd=1;
}
// comment type 2 - start with '#'
else if(*s == '#')
{
flag = 2;
}
// comment type 3 - start with '--'
else if(prev_char == '-' && *s == '-' && ((*(s+1)==' ') || (*(s+1)=='\n') || (*(s+1)=='\r') || (*(s+1)=='\t') ))
{
flag = 3;
}
// string - start with '
else if(*s == '\'' || *s == '"')
{
flag = 4;
qutr_char = *s;
}
// may be digit - start with digit
else if(is_token_char(prev_char) && is_digit_char(*s))
{
flag = 5;
if(len == i+1)
continue;
}
// not above case - remove duplicated space char
else
{
flag = 0;
if (fns==0 && is_space_char(*s)) {
s++;
i++;
continue;
}
if (fns==0) fns=1;
if(is_space_char(prev_char) && is_space_char(*s)){
prev_char = ' ';
*p_r = ' ';
s++;
i++;
continue;
}
}
}
// =================================================
// PROCESS and FINISH - do something on each case
// =================================================
else
{
// --------
// comment
// --------
if (flag == 1) {
if (cmd) {
if (ccl<FIRST_COMMENT_MAX_LENGTH-1) {
cur_comment[ccl]=*s;
ccl++;
}
}
if (fc==0) {
fc=1;
}
if (fc==1) {
if (fc_len<FIRST_COMMENT_MAX_LENGTH-1) {
if (*first_comment==NULL) {
*first_comment=(char *)malloc(FIRST_COMMENT_MAX_LENGTH);
}
char *c=*first_comment+fc_len;
*c = !is_space_char(*s) ? *s : ' ';
fc_len++;
}
if (prev_char == '*' && *s == '/') {
if (fc_len>=2) fc_len-=2;
char *c=*first_comment+fc_len;
*c=0;
//*first_comment[fc_len]=0;
fc=2;
}
}
}
if(
// comment type 1 - /* .. */
(flag == 1 && prev_char == '*' && *s == '/') ||
// comment type 2 - # ... \n
(flag == 2 && (*s == '\n' || *s == '\r' || (i == len - 1) ))
||
// comment type 2 - # ... \n
(flag == 3 && (*s == '\n' || *s == '\r' || (i == len -1) ))
)
{
p_r = p_r_t;
if (flag == 1 || (i == len -1)) {
p_r -= SIZECHAR;
}
if (cmd) {
cur_comment[ccl]=0;
if (ccl>=2) {
ccl-=2;
cur_comment[ccl]=0;
char el=0;
int fcc=0;
while (el==0 && fcc<ccl ) {
switch (cur_comment[fcc]) {
case '/':
case '*':
case '!':
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case ' ':
fcc++;
break;
default:
el=1;
break;
}
}
if (el) {
memcpy(p_r,cur_comment+fcc,ccl-fcc);
p_r+=(ccl-fcc);
*p_r++=' ';
}
}
cmd=0;
}
prev_char = ' ';
flag = 0;
s++;
i++;
continue;
}
// --------
// string
// --------
else if(flag == 4)
{
// Last char process
if(len == i + 1)
{
p_r = p_r_t;
*p_r++ = '?';
flag = 0;
break;
}
// need to be ignored case
if(p_r > p_r_t + SIZECHAR)
{
if(
(prev_char == '\\' && *s == '\\') || // to process '\\\\', '\\'
(prev_char == '\\' && *s == qutr_char) || // to process '\''
(prev_char == qutr_char && *s == qutr_char) // to process ''''
)
{
prev_char = 'X';
s++;
i++;
continue;
}
}
// satisfied closing string - swap string to ?
if(*s == qutr_char && (len == i+1 || *(s + SIZECHAR) != qutr_char))
{
p_r = p_r_t;
*p_r++ = '?';
flag = 0;
if(i < len)
s++;
i++;
continue;
}
}
// --------
// digit
// --------
else if(flag == 5)
{
// last single char
if(p_r_t == p_r)
{
*p_r++ = '?';
i++;
continue;
}
// token char or last char
if(is_token_char(*s) || len == i+1)
{
if(is_digit_string(p_r_t, p_r))
{
p_r = p_r_t;
*p_r++ = '?';
if(len == i+1)
{
if(is_token_char(*s))
*p_r++ = *s;
i++;
continue;
}
}
flag = 0;
}
}
}
// =================================================
// COPY CHAR
// =================================================
// convert every space char to ' '
if (lowercase==0) {
*p_r++ = !is_space_char(*s) ? *s : ' ';
} else {
*p_r++ = !is_space_char(*s) ? (tolower(*s)) : ' ';
}
prev_char = *s++;
i++;
}
// remove a trailing space
if (p_r>r) {
char *e=p_r;
e--;
if (*e==' ') {
*e=0;
}
}
*p_r = 0;
// process query stats
return r;
}