/* c_tokenizer.c */ // Borrowed from http://www.cplusplus.com/faq/sequences/strings/split/ #include #include #include #include "c_tokenizer.h" extern __thread int mysql_thread___query_digests_max_query_length; #include #define bool char extern __thread bool mysql_thread___query_digests_lowercase; extern __thread bool mysql_thread___query_digests_replace_null; extern __thread bool mysql_thread___query_digests_no_digits; void tokenizer(tokenizer_t *result, const char* s, const char* delimiters, int empties ) { //tokenizer_t result; result->s_length = ( (s && delimiters) ? strlen(s) : 0 ); result->s = NULL; if (result->s_length) { if (result->s_length > (PROXYSQL_TOKENIZER_BUFFSIZE-1)) { result->s = strdup(s); } else { strcpy(result->buffer,s); result->s = result->buffer; } } result->delimiters = delimiters; result->current = NULL; result->next = result->s; result->is_ignore_empties = (empties != TOKENIZER_EMPTIES_OK); //return result; } const char* free_tokenizer( tokenizer_t* tokenizer ) { if (tokenizer->s_length > (PROXYSQL_TOKENIZER_BUFFSIZE-1)) { free(tokenizer->s); } tokenizer->s = NULL; return NULL; } const char* tokenize( tokenizer_t* tokenizer ) { if (!tokenizer->s) return NULL; if (!tokenizer->next) return free_tokenizer( tokenizer ); tokenizer->current = tokenizer->next; tokenizer->next = strpbrk( tokenizer->current, tokenizer->delimiters ); if (tokenizer->next) { *tokenizer->next = '\0'; tokenizer->next += 1; if (tokenizer->is_ignore_empties) { tokenizer->next += strspn( tokenizer->next, tokenizer->delimiters ); if (!(*tokenizer->current)) return tokenize( tokenizer ); } } else if (tokenizer->is_ignore_empties && !(*tokenizer->current)) return free_tokenizer( tokenizer ); return tokenizer->current; } void c_split_2(const char *in, const char *del, char **out1, char **out2) { *out1=NULL; *out2=NULL; const char *t; tokenizer_t tok; tokenizer( &tok, in, del, TOKENIZER_NO_EMPTIES ); for ( t=tokenize(&tok); t; t=tokenize(&tok)) { if (*out1==NULL) { *out1=strdup(t); continue; } if (*out2==NULL) { *out2=strdup(t); continue; } } if (*out1==NULL) *out1=strdup(""); if (*out2==NULL) *out2=strdup(""); free_tokenizer( &tok ); } #define SIZECHAR sizeof(char) // check char if it could be table name static inline char is_normal_char(char c) { if(c >= 'a' && c <= 'z') return 1; if(c >= 'A' && c <= 'Z') return 1; if(c >= '0' && c <= '9') return 1; if(c == '$' || c == '_') return 1; return 0; } // token char - not table name string static inline char is_token_char(char c) { return !is_normal_char(c); } // space - it's much easy to remove duplicated space chars static inline char is_space_char(char c) { if(c == ' ' || c == '\t' || c == '\n' || c == '\r') return 1; return 0; } // check digit static inline char is_digit_char(char c) { if(c >= '0' && c <= '9') return 1; return 0; } // check if it can be HEX char static inline char is_hex_char(char c) { if((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) return 1; return 0; } // between pointer, check string is number - need to be changed more functions static char is_digit_string(char *f, char *t) { if(f == t) { if(is_digit_char(*f)) return 1; else return 0; } int is_hex = 0; int i = 0; // 0x, 0X while(f != t) { if(i == 1 && *(f-1) == '0' && (*f == 'x' || *f == 'X')) { is_hex = 1; } // none hex else if(!is_hex && !is_digit_char(*f)) { return 0; } // hex else if(is_hex && !is_hex_char(*f)) { return 0; } f++; i++; } // need to be added function ---------------- // 23e // 23e+1 return 1; } char *mysql_query_digest_and_first_comment(char *s, int _len, char **first_comment, char *buf){ int i = 0; char cur_comment[FIRST_COMMENT_MAX_LENGTH]; cur_comment[0]=0; int ccl=0; int cmd=0; int len = _len; if (_len > mysql_thread___query_digests_max_query_length) { len = mysql_thread___query_digests_max_query_length; } char *r = buf; if (r==NULL) { r = (char *) malloc(len + SIZECHAR); } char *p_r = r; char *p_r_t = r; char prev_char = 0; char qutr_char = 0; char flag = 0; char fc=0; int fc_len=0; char fns=0; bool lowercase=0; bool replace_null=0; bool replace_number=0; lowercase=mysql_thread___query_digests_lowercase; replace_null = mysql_thread___query_digests_replace_null; replace_number = mysql_thread___query_digests_no_digits; while(i < len) { // ================================================= // START - read token char and set flag what's going on. // ================================================= if(flag == 0) { // store current position p_r_t = p_r; // comment type 1 - start with '/*' if(prev_char == '/' && *s == '*') { ccl=0; flag = 1; if (*(s+1)=='!') cmd=1; } // comment type 2 - start with '#' else if(*s == '#') { flag = 2; } // comment type 3 - start with '--' else if(prev_char == '-' && *s == '-' && ((*(s+1)==' ') || (*(s+1)=='\n') || (*(s+1)=='\r') || (*(s+1)=='\t') )) { flag = 3; } else if (*s == '-') { if (prev_char != '-' && i!=(len-1) && ((*(s+1)=='-'))) { flag = 3; } else if (i==0 && ((*(s+1)=='-'))) { flag = 3; } } // string - start with ' else if(*s == '\'' || *s == '"') { flag = 4; qutr_char = *s; } // may be digit - start with digit else if(is_token_char(prev_char) && is_digit_char(*s)) { if (replace_number) { *p_r++ = '?'; while(*s != '\0' && is_digit_char(*s)) { s++; i++; } } else { flag = 5; if(len == i+1) continue; } } // not above case - remove duplicated space char else { flag = 0; if (fns==0 && is_space_char(*s)) { s++; i++; continue; } if (fns==0) fns=1; if(is_space_char(prev_char) && is_space_char(*s)){ prev_char = ' '; *p_r = ' '; s++; i++; continue; } if (replace_number) { if (!is_digit_char(prev_char) && is_digit_char(*s)) { *p_r++ = '?'; while(*s != '\0' && is_digit_char(*s)) { s++; i++; } } } if (replace_null) { if (*s == 'n' || *s == 'N') { // we search for NULL , #2171 if (i && is_token_char(prev_char)) { if (len>=4) { if (i=2) fc_len-=2; char *c=*first_comment+fc_len; *c=0; //*first_comment[fc_len]=0; fc=2; } } } if( // comment type 1 - /* .. */ (flag == 1 && prev_char == '*' && *s == '/') || // comment type 2 - # ... \n (flag == 2 && (*s == '\n' || *s == '\r' || (i == len - 1) )) || // comment type 3 - -- ... \n (flag == 3 && (*s == '\n' || *s == '\r' || (i == len -1) )) ) { p_r = p_r_t; if (flag == 1 || (i == len -1)) { p_r -= SIZECHAR; } if (cmd) { cur_comment[ccl]=0; if (ccl>=2) { ccl-=2; cur_comment[ccl]=0; char el=0; int fcc=0; while (el==0 && fcc= r && ( *(_p+2) == '-' || *(_p+2) == '+') ) { if ( ( *(_p+1) == ',' ) || ( *(_p+1) == '(' ) || ( ( *(_p+1) == ' ' ) && ( *_p == ',' || *_p == '(' ) ) ) { p_r--; } } *p_r++ = '?'; flag = 0; break; } // need to be ignored case if(p_r > p_r_t + SIZECHAR) { if( (prev_char == '\\' && *s == '\\') || // to process '\\\\', '\\' (prev_char == '\\' && *s == qutr_char) || // to process '\'' (prev_char == qutr_char && *s == qutr_char) // to process '''' ) { prev_char = 'X'; s++; i++; continue; } } // satisfied closing string - swap string to ? if(*s == qutr_char && (len == i+1 || *(s + SIZECHAR) != qutr_char)) { char *_p = p_r_t; _p-=3; p_r = p_r_t; if ( _p >= r && ( *(_p+2) == '-' || *(_p+2) == '+') ) { if ( ( *(_p+1) == ',' ) || ( *(_p+1) == '(' ) || ( ( *(_p+1) == ' ' ) && ( *_p == ',' || *_p == '(' ) ) ) { p_r--; } } *p_r++ = '?'; flag = 0; if(i < len) s++; i++; continue; } } // -------- // digit // -------- else if(flag == 5) { // last single char if(p_r_t == p_r) { char *_p = p_r_t; _p-=3; if ( _p >= r && ( *(_p+2) == '-' || *(_p+2) == '+') ) { if ( ( *(_p+1) == ',' ) || ( *(_p+1) == '(' ) || ( ( *(_p+1) == ' ' ) && ( *_p == ',' || *_p == '(' ) ) ) { p_r--; } } *p_r++ = '?'; i++; continue; } // token char or last char if(is_token_char(*s) || len == i+1) { if(is_digit_string(p_r_t, p_r)) { char *_p = p_r_t; _p-=3; p_r = p_r_t; if ( _p >= r && ( *(_p+2) == '-' || *(_p+2) == '+') ) { if ( ( *(_p+1) == ',' ) || ( *(_p+1) == '(' ) || ( ( *(_p+1) == ' ' ) && ( *_p == ',' || *_p == '(' ) ) ) { p_r--; } } *p_r++ = '?'; if(len == i+1) { if(is_token_char(*s)) *p_r++ = *s; i++; continue; } } flag = 0; } } } // ================================================= // COPY CHAR // ================================================= // convert every space char to ' ' if (lowercase==0) { *p_r++ = !is_space_char(*s) ? *s : ' '; } else { *p_r++ = !is_space_char(*s) ? (tolower(*s)) : ' '; } prev_char = *s++; i++; } // remove a trailing space if (p_r>r) { char *e=p_r; e--; if (*e==' ') { *e=0; // maybe 2 trailing spaces . It happens with comments e--; if (*e==' ') { *e=0; } } } *p_r = 0; // process query stats return r; } char *mysql_query_strip_comments(char *s, int _len) { int i = 0; int len = _len; char *r = (char *) malloc(len + SIZECHAR); char *p_r = r; char *p_r_t = r; char prev_char = 0; char flag = 0; char fns=0; bool lowercase=0; lowercase=mysql_thread___query_digests_lowercase; while(i < len) { // ================================================= // START - read token char and set flag what's going on. // ================================================= if(flag == 0) { // store current position p_r_t = p_r; // comment type 1 - start with '/*' if(prev_char == '/' && *s == '*') { flag = 1; } // comment type 2 - start with '#' else if(*s == '#') { flag = 2; } // comment type 3 - start with '--' else if(prev_char == '-' && *s == '-' && ((*(s+1)==' ') || (*(s+1)=='\n') || (*(s+1)=='\r') || (*(s+1)=='\t') )) { flag = 3; } // not above case - remove duplicated space char else { flag = 0; if (fns==0 && is_space_char(*s)) { s++; i++; continue; } if (fns==0) fns=1; if(is_space_char(prev_char) && is_space_char(*s)){ prev_char = ' '; *p_r = ' '; s++; i++; continue; } } } // ================================================= // PROCESS and FINISH - do something on each case // ================================================= else { // -------- // comment // -------- if( // comment type 1 - /* .. */ (flag == 1 && prev_char == '*' && *s == '/') || // comment type 2 - # ... \n (flag == 2 && (*s == '\n' || *s == '\r' || (i == len - 1) )) || // comment type 3 - -- ... \n (flag == 3 && (*s == '\n' || *s == '\r' || (i == len -1) )) ) { p_r = p_r_t; if (flag == 1 || (i == len -1)) { p_r -= SIZECHAR; } prev_char = ' '; flag = 0; s++; i++; continue; } } // ================================================= // COPY CHAR // ================================================= // convert every space char to ' ' if (lowercase==0) { *p_r++ = !is_space_char(*s) ? *s : ' '; } else { *p_r++ = !is_space_char(*s) ? (tolower(*s)) : ' '; } prev_char = *s++; i++; } // remove a trailing space if (p_r>r) { char *e=p_r; e--; if (*e==' ') { *e=0; } } *p_r = 0; return r; }