#include #include #include #include #include "c_tokenizer.h" extern __thread int mysql_thread___query_digests_max_query_length; extern __thread bool mysql_thread___query_digests_lowercase; extern __thread bool mysql_thread___query_digests_replace_null; extern __thread bool mysql_thread___query_digests_no_digits; extern __thread bool mysql_thread___query_digests_grouping_limit; extern __thread bool mysql_thread___query_digests_groups_grouping_limit; extern __thread bool mysql_thread___query_digests_keep_comment; void tokenizer(tokenizer_t *result, const char* s, const char* delimiters, int empties ) { //tokenizer_t result; result->s_length = ( (s && delimiters) ? strlen(s) : 0 ); result->s = NULL; if (result->s_length) { if (result->s_length > (PROXYSQL_TOKENIZER_BUFFSIZE-1)) { result->s = strdup(s); } else { strcpy(result->buffer,s); result->s = result->buffer; } } result->delimiters = delimiters; result->current = NULL; result->next = result->s; result->is_ignore_empties = (empties != TOKENIZER_EMPTIES_OK); //return result; } const char* free_tokenizer( tokenizer_t* tokenizer ) { if (tokenizer->s_length > (PROXYSQL_TOKENIZER_BUFFSIZE-1)) { free(tokenizer->s); } tokenizer->s = NULL; return NULL; } const char* tokenize( tokenizer_t* tokenizer ) { if (!tokenizer->s) return NULL; if (!tokenizer->next) return free_tokenizer( tokenizer ); tokenizer->current = tokenizer->next; tokenizer->next = strpbrk( tokenizer->current, tokenizer->delimiters ); if (tokenizer->next) { *tokenizer->next = '\0'; tokenizer->next += 1; if (tokenizer->is_ignore_empties) { tokenizer->next += strspn( tokenizer->next, tokenizer->delimiters ); if (!(*tokenizer->current)) return tokenize( tokenizer ); } } else if (tokenizer->is_ignore_empties && !(*tokenizer->current)) return free_tokenizer( tokenizer ); return tokenizer->current; } void c_split_2(const char *in, const char *del, char **out1, char **out2) { *out1=NULL; *out2=NULL; const char *t; tokenizer_t tok; tokenizer( &tok, in, del, TOKENIZER_NO_EMPTIES ); for ( t=tokenize(&tok); t; t=tokenize(&tok)) { if (*out1==NULL) { *out1=strdup(t); continue; } if (*out2==NULL) { *out2=strdup(t); continue; } } if (*out1==NULL) *out1=strdup(""); if (*out2==NULL) *out2=strdup(""); free_tokenizer( &tok ); } #define SIZECHAR sizeof(char) // check char if it could be table name static inline char is_normal_char(char c) { if(c >= 'a' && c <= 'z') return 1; if(c >= 'A' && c <= 'Z') return 1; if(c >= '0' && c <= '9') return 1; if(c == '$' || c == '_') return 1; return 0; } // token char - not table name string static inline char is_token_char(char c) { return !is_normal_char(c); } // space - it's much easy to remove duplicated space chars static inline char is_space_char(char c) { if(c == ' ' || c == '\t' || c == '\n' || c == '\r') return 1; return 0; } // check digit static inline char is_digit_char(char c) { if(c >= '0' && c <= '9') return 1; return 0; } // check if it can be HEX char static inline char is_hex_char(char c) { if((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) return 1; return 0; } // between pointer, check string is number - need to be changed more functions // TODO: f-1 shouldn't be access if 'f' is the first position supplied, could lead to // buffer overflow. NOTE: This is now addressed by 'is_digit_string_2'. static char is_digit_string(char *f, char *t) { if(f == t) { if(is_digit_char(*f)) return 1; else return 0; } int is_hex = 0; int i = 0; // 0x, 0X, n.m, nE+m, nE-m, Em while(f != t) { char is_float = *f == '.' || tolower(*f) == 'e' || (tolower(*(f-1)) == 'e' && (*f == '+' || *f == '-')); if(i == 1 && *(f-1) == '0' && (*f == 'x' || *f == 'X')) { is_hex = 1; } // none hex else if(!is_hex && !is_digit_char(*f) && is_float == 0) { return 0; } // hex else if(is_hex && !is_hex_char(*f)) { return 0; } f++; i++; } return 1; } static inline char is_arithmetic_op(char op) { if (op == '+') { return 1; } else if (op == '-') { return 1; } else if (op == '*') { return 1; } else if (op == '/') { return 1; } else if (op == '%') { return 1; } else { return 0; } } static inline void replace_with_q_mark( char grouping_digest, int grouping_lim, int* grouping_count, char** p_r, char* grouping_limit_exceeded ) { if (*grouping_count < grouping_lim) { **p_r = '?'; (*p_r)++; if (grouping_digest) { *grouping_count += 1; } } else { if (!(*grouping_limit_exceeded)) { **p_r = '.'; (*p_r)++; **p_r = '.'; (*p_r)++; **p_r = '.'; (*p_r)++; *grouping_limit_exceeded=1; } else { // since delimiters are always copied, if 'grouping_lim' is exceeded, we remove any extra ',' // that have been copied after the previously placed '...'. // // NOTE: Avoid copying delimiters in case of query grouping can lead to commas not being copied // before values not being replaced, like 'NULL' values. if (*(*p_r - 1) == ',') { (*p_r)--; } } } } /** * @brief Helper functiont that initializes the supplied 'options' struct with the configuration variables * values. * * @param opts The options struct to be initialized. */ static inline void get_mysql_options(options* opts) { opts->lowercase = mysql_thread___query_digests_lowercase; opts->replace_null = mysql_thread___query_digests_replace_null; opts->replace_number = mysql_thread___query_digests_no_digits; opts->grouping_limit = mysql_thread___query_digests_grouping_limit; opts->groups_grouping_limit = mysql_thread___query_digests_groups_grouping_limit; opts->keep_comment = mysql_thread___query_digests_keep_comment; opts->max_query_length = mysql_thread___query_digests_max_query_length; } /** * @brief Enum holding all the states responsible for value parsing using during 'stage 1' parsing. */ enum p_st { st_no_mark_found = 0, st_cmnt_type_1 = 1, st_cmnt_type_2 = 2, st_cmnt_type_3 = 3, st_literal_string = 4, st_literal_number = 5, st_replace_null = 6 }; /** * @brief Parsing information from received query and the result buffer shared between the different * processing stages. */ typedef struct shared_st { /* @brief Global computed compression offset from the previous iteration. Used when uncompressed query exceeds the maximum buffer side specified by `mysql_thread___query_digests_max_query_length` */ int gl_c_offset; /* @brief Maximum length of the resulting digest. */ int d_max_len; /* @brief Pointer to current reading position of the supplied query. */ const char* q; /* @brief Length of the supplied query. */ int q_len; /* @brief Current position of the iteration over the supplied queried. */ int q_cur_pos; /* @brief Pointer to the initial position of the result buffer. */ char* res_init_pos; /* @brief Pointer to the initial position of the result buffer *for current processing iteration* */ char* res_it_init_pos; /* @brief Current position of the iteration over the return buffer. */ char* res_cur_pos; /* @brief Position in the return buffer prior to the start of any parsing st that isn't 'no_mark_found'. */ char* res_pre_pos; /* @brief The current state being processed by 'stage_1'. */ enum p_st st; /* @brief Last copied char to the result buffer. */ char prev_char; /* @brief Preserve currently imposed 'prev_char' in **on current** char processing instead of replacing it. */ bool keep_prev_char; /* @brief Decides whether or not the next char should be copy during 'stage_1'. */ bool copy_next_char; } shared_st; /** * @brief State used for parsing 'type_1' comments, i.e: /\* *\/. */ typedef struct cmnt_type_1_st { /* @brief Flag to announce if the found comment is a 'cmd' comment. */ bool is_cmd; /* @brief Counter holding the length of the 'cmd' comment currently being processed. */ int cur_cmd_cmnt_len; /** * @brief Flag showing first comment parsing state. '0' when no comment or end has been found, and '1' * when the first comment has already been found. * @details This flag is NEVER reset, since only the first found comment is retrieved for being further * processed. */ int fst_cmnt_end; /* @brief Counter keeping track of the number of chars copied into 'first_comment' buffer. */ int fst_cmnt_len; } cmnt_type_1_st; /** * @brief State used for parsing 'literal strings' values, i.e: 'foo', "bar", etc.. */ typedef struct literal_string_st { /** * @brief Boolean flag showing if the first delimiter from a literal string has been found. * '0' when hasn't yet been found, and '1' while in the processing a literal string. */ int delim_num; /* @brief Found char delimiter found for the literal string being processed. */ char delim_char; const char* q_start_pos; } literal_string_st; /** * @brief State used for parsing 'literal digit' values, e.g: 84, 0x100, 1E-10, etc... */ typedef struct literal_digit_st { bool first_digit; char* start_pos; } literal_digit_st; /** * @brief Created for an alternative implementation of NULL parsing. * Currently unused. TODO: Remove. */ typedef struct literal_null_st { int null_pos; } literal_null_st; /** * @brief State used for 'stage_1' parsing. */ typedef struct stage_1_st { struct cmnt_type_1_st cmnt_type_1_st; struct literal_string_st literal_str_st; struct literal_digit_st literal_digit_st; /* @brief Holds the previous iteration parsing ending position. */ char* pre_it_pos; /** * @brief Previous iteration parsing ending position for 'stage_1'. * @details This position should be kept as the 'stage_1' final position may differ from final positions * for later stages. This event takes place when 'stage_1' hasn't finished parsing a value which * requires copying (i.e. a 'number literal') and digest buffer runned out of space. Under this * circunstance, later stages don't process the 'number literal' interval, but copy it's values in case * that a later 'stage_1' iteration can resume the literal parsing. */ char* new_end_pos; } stage_1_st; /** * @brief Holds the state used for 'stage_2' parsing. */ typedef struct stage_2_st { /* @brief Previous iteration last parsing position in the result buffer, after the stage compression has taken place. */ char* pre_it_pos; /* @brief Last iteration computed compression offset resulted after stage processing. */ int c_offset; } stage_2_st; typedef struct stage_3_st { /* @brief Previous iteration last parsing position in the result buffer, after the stage compression has taken place. */ char* pre_it_pos; /* @brief Last iteration computed compression offset resulted after stage processing. */ int c_offset; } stage_3_st; typedef struct stage_4_st { /* @brief Previous iteration last parsing position in the result buffer, after the stage compression has taken place. */ char* pre_it_pos; /* @brief Last iteration computed compression offset resulted after stage processing. */ int c_offset; } stage_4_st; static __attribute__((always_inline)) inline void init_shared_st(struct shared_st* shared_st, const char* const q, int q_len, int d_max_len, char* res) { shared_st->q = q; shared_st->q_len = q_len; shared_st->d_max_len = d_max_len; // all position start with the beginning of the result buffer shared_st->res_init_pos = res; shared_st->res_it_init_pos = res; shared_st->res_cur_pos = res; shared_st->res_pre_pos = res; // initial state for the first stage state machine shared_st->st = st_no_mark_found; } static __attribute__((always_inline)) inline void init_stage_1_st(struct stage_1_st* fst_stage_st) { fst_stage_st->literal_digit_st.first_digit = 1; } static inline int get_digest_max_len(int len, int max_query_length) { int digest_max_len = 0; if (len > max_query_length) { digest_max_len = max_query_length; } else { digest_max_len = len; } return digest_max_len; } static inline char* get_result_buffer(int len, char* buf) { char* r = NULL; if (buf == NULL) { r = (char *) malloc(len + SIZECHAR); } else { r = buf; } return r; } /** * @brief Return the next st to be processed. State filtering based on end of query being reached is also * performed here. * * @param shared_st The shared processing state used to decide which is the next 'processing state'. * * @return The next processing state. */ static __attribute__((always_inline)) inline enum p_st get_next_st(const options* opts, struct shared_st* shared_st) { char prev_char = shared_st->prev_char; enum p_st st = st_no_mark_found; // cmnt type 1 - start with '/*' if( // v1_crashing_payload_05 shared_st->q_cur_pos < (shared_st->q_len - 2) && *shared_st->q == '/' && *(shared_st->q+1) == '*' ) { st = st_cmnt_type_1; } // cmnt type 2 - start with '#' else if(*shared_st->q == '#') { st = st_cmnt_type_2; } // cmnt type 3 - start with '--' else if ( // shared_st->query isn't over, need to check next character shared_st->q_cur_pos < (shared_st->q_len - 2) && // found starting pattern '-- ' (space is required) *shared_st->q == '-' && *(shared_st->q+1) == '-' && is_space_char(*(shared_st->q+2)) ) { if (prev_char != '-') { st = st_cmnt_type_3; } else if (shared_st->q_cur_pos == 0) { st = st_cmnt_type_3; } } // string - start with ' else if (*shared_st->q == '\'' || *shared_st->q == '"') { st = st_literal_string; } // may be digit - start with digit else if (is_token_char(prev_char) && is_digit_char(*shared_st->q)) { st = st_literal_number; } // NULL processing else if ( is_token_char(shared_st->prev_char) && (*shared_st->q == 'n' || *shared_st->q == 'N') && opts->replace_null ) { st = st_replace_null; } return st; } static __attribute__((always_inline)) inline void inc_proc_pos(shared_st* shared_st) { if (shared_st->keep_prev_char == false) { shared_st->prev_char = *shared_st->q; } else { shared_st->keep_prev_char = false; } shared_st->q++; shared_st->q_cur_pos++; } /** * @brief Copy the next character and increment the current processing position. * * @param opts Options that determine how the next character is going to be copied. * @param shared_st The shared state to modify. */ static __attribute__((always_inline)) inline void copy_next_char(shared_st* shared_st, const options* opts) { // copy the next character; translating any space char into ' ' if (opts->lowercase==0) { *shared_st->res_cur_pos++ = !is_space_char(*shared_st->q) ? *shared_st->q : ' '; } else { *shared_st->res_cur_pos++ = !is_space_char(*shared_st->q) ? tolower(*shared_st->q) : ' '; } inc_proc_pos(shared_st); } static thread_local char cur_cmd_cmnt[FIRST_COMMENT_MAX_LENGTH]; /** * @brief Safer version of 'is_digit_string' performing boundary checks. * * @param shared_st The shared state used for the boundary checks. * @param f Initial position of the string being checked. * @param t Final position of the string being checked. * * @return '1' if the supplied string is recognized as a 'digit_string', '0' otherwise. */ static char is_digit_string_2(shared_st* shared_st, char *f, char *t) { if(f == t) { if(is_digit_char(*f)) return 1; else return 0; } int is_hex = 0; int i = 0; // 0x, 0X, n.m, nE+m, nE-m, Em while(f != t) { char is_float = 0; if (f > shared_st->res_init_pos) { is_float = *f == '.' || tolower(*f) == 'e' || (tolower(*(f-1)) == 'e' && (*f == '+' || *f == '-')); } else { is_float = *f == '.' || tolower(*f) == 'e'; } if(f > shared_st->res_init_pos && i == 1 && *(f-1) == '0' && (*f == 'x' || *f == 'X')) { is_hex = 1; } // none hex else if(!is_hex && !is_digit_char(*f) && is_float == 0) { return 0; } // hex else if(is_hex && !is_hex_char(*f)) { return 0; } f++; i++; } return 1; } /** * @brief Process a detected comment of type "/\* *\/". Determines when to exit the 'st_cmnt_type_1' state. * @details Function assumes that 'shared_st->q' is pointing to the initial mark '/' of the comment start, and * that it's safe to look forward for '*'. State 'st_cmnt_type_1' doesn't copy any data to the result * buffer, unless the comment is a 'cmd' comment, in which case the comment is copied from the query to the * resulting buffer **after** the comment final delimiter '*\/' has been found. * * @param shared_st Shared state used to continue the query processing. * @param c_t_1_st The 'comment_type_1' parsing state, holds the found information about the comment being parsed. * * @return The next processing state, it could be either: * - 'st_cmnt_type_1' if the comment hasn't yet completed to be parsed. * - 'st_no_mark_found' if the comment has completed to be parsed. */ static __attribute__((always_inline)) inline enum p_st process_cmnt_type_1(const options* opts, shared_st* shared_st, cmnt_type_1_st* c_t_1_st, char** fst_cmnt) { enum p_st next_st = st_cmnt_type_1; const char* res_final_pos = shared_st->res_init_pos + shared_st->d_max_len; // initial mark "/*|/*!" detection // comments are not copied by while processed, boundary checks should rely on 'q_cur_pos' and 'q_len'. if (shared_st->q_cur_pos <= (shared_st->q_len-2) && *shared_st->q == '/' && *(shared_st->q+1) == '*') { c_t_1_st->cur_cmd_cmnt_len = 0; // check length before accessing beyond 'q_cur_pos + 1' if (shared_st->q_cur_pos != (shared_st->q_len-2) && *(shared_st->q+2) == '!') { c_t_1_st->is_cmd = 1; } // copy the initial mark "/*" if comment preserving is enabled if (opts->keep_comment) { cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = *(shared_st->q); c_t_1_st->cur_cmd_cmnt_len++; cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = *(shared_st->q + 1); c_t_1_st->cur_cmd_cmnt_len++; } // discard processed "/*" or "/*!" shared_st->q += 2 + c_t_1_st->is_cmd; shared_st->q_cur_pos += 2 + c_t_1_st->is_cmd; // v1_crashing_payload_04 if (shared_st->q_cur_pos >= shared_st->q_len - 1) { return st_no_mark_found; } } // TODO: Check if there is exclusion between this regular first comments and first comment that are 'cmd' // comments by spec. To further clarify, should comments '/*!' be not considered first comments to be copied // into the supplied 'fst_cmnt' memory? Or should they be considered for further processing? // { // we are parsing a "/*!" comment if (c_t_1_st->is_cmd || (c_t_1_st->is_cmd == false && opts->keep_comment)) { // copy the char into 'cur_cmd_cmnt' if (c_t_1_st->cur_cmd_cmnt_len < FIRST_COMMENT_MAX_LENGTH-1) { cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = *shared_st->q; c_t_1_st->cur_cmd_cmnt_len++; } } // first comment hasn't finished, we are yet copying it if (c_t_1_st->fst_cmnt_end == 0) { // copy the char into 'fst_cmnt' buffer if (c_t_1_st->fst_cmnt_len < FIRST_COMMENT_MAX_LENGTH-1) { if (*fst_cmnt == NULL) { // initialize the 'first_comment' and set a final NULL terminator for safety *fst_cmnt = (char*)malloc(FIRST_COMMENT_MAX_LENGTH); *(*fst_cmnt + FIRST_COMMENT_MAX_LENGTH - 1) = 0; } char* next_fst_cmnt_char = *fst_cmnt + c_t_1_st->fst_cmnt_len; *next_fst_cmnt_char = !is_space_char(*shared_st->q) ? *shared_st->q : ' '; c_t_1_st->fst_cmnt_len++; } // detect comment end for first comment type if (shared_st->prev_char == '*' && *shared_st->q == '/') { // remove last two chars from length if it's at least size '2'. if (c_t_1_st->fst_cmnt_len >= 2) { c_t_1_st->fst_cmnt_len -= 2; } // set 'zero' at the end of comment and set finish flag 'fst_cmnt_end'. char* c_end = *fst_cmnt + c_t_1_st->fst_cmnt_len; *c_end = 0; c_t_1_st->fst_cmnt_end = 1; } } // } // comment type 1 - /* .. */ if (shared_st->prev_char == '*' && *shared_st->q == '/') { if (c_t_1_st->is_cmd || (c_t_1_st->is_cmd == false && opts->keep_comment)) { cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len]=0; if (c_t_1_st->cur_cmd_cmnt_len >= 2) { // we are not interested into copying the final '*/' for the comment if (opts->keep_comment == false) { c_t_1_st->cur_cmd_cmnt_len -= 2; } cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = 0; // counter for the lenght of the cmd comment annotation, with format `/*!12345 ... */`. int cmnt_annot_len = 0; bool end = 0; // count the number of chars found before annotation ends while (end == 0 && cmnt_annot_len < c_t_1_st->cur_cmd_cmnt_len) { if ( cur_cmd_cmnt[cmnt_annot_len] == '/' || cur_cmd_cmnt[cmnt_annot_len] == '*' || cur_cmd_cmnt[cmnt_annot_len] == '!' || cur_cmd_cmnt[cmnt_annot_len] == ' ' || is_digit_char(cur_cmd_cmnt[cmnt_annot_len]) ) { cmnt_annot_len += 1; } else { end = 1; } } // copy the cmd comment minus the annotation and the marks if (end) { // check if the comment to be copied is going to fit in the target buffer int res_free_space = res_final_pos - shared_st->res_cur_pos; int comment_size = 0; if (opts->keep_comment) { comment_size = c_t_1_st->cur_cmd_cmnt_len; } else { comment_size = c_t_1_st->cur_cmd_cmnt_len - cmnt_annot_len; } int copy_length = res_free_space > comment_size ? comment_size : res_free_space; if (opts->keep_comment) { memcpy(shared_st->res_cur_pos, cur_cmd_cmnt, copy_length); } else { memcpy(shared_st->res_cur_pos, cur_cmd_cmnt + cmnt_annot_len, copy_length); } shared_st->res_cur_pos += copy_length; // The extra space is due to the removal of '*/', this is relevant because the // comment can be in the middle of the query. if (*(shared_st->res_cur_pos - 1 ) != ' ' && shared_st->res_cur_pos != res_final_pos) { *shared_st->res_cur_pos++ = ' '; } } } // Re-initialize the comment state c_t_1_st->is_cmd = 0; c_t_1_st->cur_cmd_cmnt_len = 0; } if ( // not at the beginning or at the end of the query shared_st->res_init_pos != shared_st->res_cur_pos && shared_st->res_cur_pos != res_final_pos && // if the prev copied char isn't a space comment wasn't space separated in the query: // ``` // Q: `SELECT/*FOO*/1` // ^ no space char // ``` // thus we impose an extra space in replace for the ommited comment *(shared_st->res_cur_pos-1) != ' ' ) { *shared_st->res_cur_pos++ = ' '; } // if there were no space we have imposed it shared_st->prev_char = ' '; // back to main shared_st->query parsing state next_st = st_no_mark_found; // reset the comment processing state (v1_crashing_payload_04) c_t_1_st->is_cmd = 0; // skip ending mark for comment for next iteration shared_st->q_cur_pos += 1; shared_st->q++; } return next_st; } /** * @brief Handles the processing state 'st_cmnt_type_2'. * @details State 'st_cmnt_type_2' doesn't copy any data to the result buffer. It just skip the current char * by char until finding the delimiter. * * @param shared_st Shared state used to continue the query processing. * * @return The next processing state, it could be either: * - 'st_cmnt_type_2' if the comment hasn't yet completed to be parsed. * - 'st_no_mark_found' if the comment has completed to be parsed. */ static __attribute__((always_inline)) inline enum p_st process_cmnt_type_2(shared_st* shared_st) { enum p_st next_state = st_cmnt_type_2; // discard processed "#" (v1_crashing_payload_02) if (*shared_st->q == '#' && shared_st->q_cur_pos <= (shared_st->q_len - 2)) { shared_st->q += 1; shared_st->q_cur_pos += 1; } if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos >= shared_st->q_len - 1)) { next_state = st_no_mark_found; shared_st->prev_char = ' '; shared_st->q++; shared_st->q_cur_pos++; } return next_state; } /** * @brief Handles the processing state 'st_cmnt_type_3'. * @details State 'st_cmnt_type_2' doesn't copy any data to the result buffer. It just skip the current char * by char until finding the delimiter. * * @param shared_st Shared state used to continue the query processing. * * @return The next processing state, it could be either: * - 'st_cmnt_type_3' if the comment hasn't yet completed to be parsed. * - 'st_no_mark_found' if the comment has completed to be parsed. */ static __attribute__((always_inline)) inline enum p_st process_cmnt_type_3(shared_st* shared_st) { enum p_st next_state = st_cmnt_type_3; // discard processed "-- " if ( shared_st->q_cur_pos <= (shared_st->q_len - 4) && *shared_st->q == '-' && *(shared_st->q+1)=='-' && is_space_char(*(shared_st->q+2)) ) { shared_st->q += 3; shared_st->q_cur_pos += 3; } if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos >= shared_st->q_len - 1)) { next_state = st_no_mark_found; shared_st->prev_char = ' '; shared_st->q++; shared_st->q_cur_pos++; } return next_state; } /** * @brief Handles the processing state 'st_literal_string'. * @details State 'st_literal_string' doesn't copy any data to the result buffer, instead, it just skips the * current char until the end delimiter is found. Then replaces the previous position in the result buffer * with the mark '?'. * * TODO: This function currently doesn't take into account if 'NO_BACKSLASH_ESCAPES' sql_mode is being used. * This can lead to 'stats_mysql_query_digest' pollution because a valid query could be received with strings * ending in '\''. With current implementation this final '\'' will be collapsed, leading to a string not * properly finding the target string delimiter. To solve this scenario the following approach could be taken: * - Add an additional parameter to 'Query_Info::begin' that propagates 'no_backslash_escapes' from * 'MySQL_Session::client_myds::myconn::options' through 'Query_Info::query_parser_init'. * - Add a new parameter to 'query_parser_init' (or reuse currently unused 'flags' for this purpose). * - Add a new parameter to 'mysql_query_digest_and_first_comment_2' to propagate this flags. * - Add a new field into the 'options' struct defined in this file for holding such flags. * - Pass 'options' already received by 'stage_1_parsing' into this function and make use of it for deciding * whether to ignore the processing of chars within the string when are preceded by '\'. * This is just a proposal and a future implementation may be subject to change. * * @param shared_st Shared state used to continue the query processing. * @param str_st The literal string parsing state, holds the information so far found about the state. * * @return The next processing state, it could be either: * - 'st_literal_string' if the string literal hasn't yet completed to be parsed. * - 'st_no_mark_found' if the string literal has completed to be parsed. */ static __attribute__((always_inline)) inline enum p_st process_literal_string(shared_st* shared_st, literal_string_st* str_st) { enum p_st next_state = st_literal_string; // process the first delimiter if (str_st->delim_num == 0) { // store found delimiter str_st->q_start_pos = shared_st->q; str_st->delim_char = *shared_st->q; str_st->delim_num = 1; // NOTE: Don't increment the position in query buffer, as explained in 'stage_1_parsing'. return next_state; } // need to be ignored case if(shared_st->q > str_st->q_start_pos + SIZECHAR) { if( (shared_st->prev_char == '\\' && *shared_st->q == '\\') || // to process '\\\\', '\\' (shared_st->prev_char == '\\' && *shared_st->q == str_st->delim_char) || // to process '\'' (shared_st->prev_char == str_st->delim_char && *shared_st->q == str_st->delim_char) // to process '''' ) { shared_st->keep_prev_char = true; shared_st->prev_char = 'X'; // NOTE: Don't increment the position in query buffer. See 'stage_1_parsing' doc. return next_state; } } // satisfied closing string - swap string to ? if( *shared_st->q == str_st->delim_char && (shared_st->q_len == shared_st->q_cur_pos+1 || *(shared_st->q + SIZECHAR) != str_st->delim_char) ) { // NOTE: may not be necessary since we don't increment 'res_cur_pos' during this state. Since all the // characters are ignored. shared_st->res_cur_pos = shared_st->res_pre_pos; // place the replacement mark *shared_st->res_cur_pos++ = '?'; shared_st->prev_char = '?'; // don't copy this char if last if (shared_st->q_len == shared_st->q_cur_pos + 1) { shared_st->copy_next_char = 0; // keep the same state, no token was found return next_state; } // reinit the string literal state str_st->delim_char = 0; str_st->delim_num = 0; str_st->q_start_pos = 0; // update the shared state shared_st->prev_char = str_st->delim_char; if(shared_st->q_cur_pos < shared_st->q_len) { shared_st->q++; } shared_st->q_cur_pos++; // exit the literal parsing state next_state = st_no_mark_found; } return next_state; } /** * @brief Handles the processing state 'st_literal_digit'. * * @param shared_st Shared state used to continue the query processing. * @param digit_st The literal digit parsing state, holds the information so far found about the state. * @param opts TODO: Currently unused, remove. * * @return The next processing state, it could be either: * - 'st_literal_digit' if the literal number hasn't yet completed to be parsed. * - 'st_no_mark_found' if the literal number has completed to be parsed. */ static __attribute__((always_inline)) inline enum p_st process_literal_digit(shared_st* shared_st, literal_digit_st* digit_st, const options* opts) { enum p_st next_state = st_literal_number; // process the first digit if (digit_st->first_digit == 1 && is_token_char(shared_st->prev_char) && is_digit_char(*shared_st->q)) { // store the start position of digit literal in the result buffer for later iterations digit_st->start_pos = shared_st->res_pre_pos; // store the first digit *shared_st->res_cur_pos = *shared_st->q; digit_st->first_digit = 0; // NOTE: Don't increment the position in query buffer, as explained in 'stage_1_parsing'. } // token char or last char char is_float_char = *shared_st->q == '.' || ( tolower(shared_st->prev_char) == 'e' && ( *shared_st->q == '-' || *shared_st->q == '+' ) ); if ((is_token_char(*shared_st->q) && is_float_char == 0) || shared_st->q_len == shared_st->q_cur_pos + 1) { if (is_digit_string_2(shared_st, digit_st->start_pos, shared_st->res_cur_pos)) { shared_st->res_cur_pos = digit_st->start_pos; // place the replacement mark *shared_st->res_cur_pos++ = '?'; shared_st->prev_char = '?'; // don't copy this char if last and is not token if (is_token_char(*shared_st->q) == 0 && shared_st->q_len == shared_st->q_cur_pos + 1) { shared_st->copy_next_char = 0; // keep the same state, no token was found return next_state; } } digit_st->start_pos = NULL; digit_st->first_digit = 0; next_state = st_no_mark_found; } return next_state; } /** * @brief Alternative impl for 'NULL' replacement, unused right now. TODO: Remove. */ static __attribute__((always_inline)) inline enum p_st process_replace_null_single_chars(shared_st* shared_st, literal_null_st* null_st) { enum p_st next_st = st_replace_null; const char* null_str = "null"; if (null_st->null_pos <= 3) { if (tolower(*shared_st->q) == null_str[null_st->null_pos]) { null_st->null_pos++; } else { next_st = st_no_mark_found; } if (shared_st->q_cur_pos == shared_st->q_len - 1 && null_st->null_pos == 4) { // no need for changing the state it's the last char shared_st->copy_next_char = 0; shared_st->res_cur_pos = shared_st->res_pre_pos; // place the replacement mark *shared_st->res_cur_pos++ = '?'; shared_st->prev_char = '?'; } } else if (null_st->null_pos == 4){ if (is_token_char(*shared_st->q)) { shared_st->copy_next_char = 0; shared_st->res_cur_pos = shared_st->res_pre_pos; // place the replacement mark *shared_st->res_cur_pos++ = '?'; shared_st->prev_char = '?'; // don't copy current char, go immediately back to initial state next_st = st_no_mark_found; } } return next_st; } /** * @brief Process the 'st_replace_null' state. * @details This state processing function doesn't check if 'replace_null' feature is enabled or not. If the * feature isn't enabled, this state should never be reached. The state 'st_replace_null' is a one operation * state always, if the 'NULL' value to be replaced isn't found, processing goes back to 'st_no_mark_found' * state immediately, for this reason, this state is responsible of copying the current char before * returning. * * @param shared_st Shared state used to continue the query processing. * @param opts Options to be used for the copying of the current char. */ static __attribute__((always_inline)) inline enum p_st process_replace_null(shared_st* shared_st, const options* opts) { enum p_st next_st = st_no_mark_found; char null_found = 0; if ((shared_st->q_len - shared_st->q_cur_pos) > 4) { null_found = (*shared_st->q == 'N' || *shared_st->q == 'n') && (*(shared_st->q+1) == 'U' || *(shared_st->q+1) == 'u') && (*(shared_st->q+2) == 'L' || *(shared_st->q+2) == 'l') && (*(shared_st->q+3) == 'L' || *(shared_st->q+3) == 'l') && is_token_char(*(shared_st->q+4)); } else if ((shared_st->q_len - shared_st->q_cur_pos) == 4) { null_found = (*shared_st->q == 'N' || *shared_st->q == 'n') && (*(shared_st->q+1) == 'U' || *(shared_st->q+1) == 'u') && (*(shared_st->q+2) == 'L' || *(shared_st->q+2) == 'l') && (*(shared_st->q+3) == 'L' || *(shared_st->q+3) == 'l'); } else { null_found = 0; } if (null_found == 1) { // place the replacement mark shared_st->res_cur_pos = shared_st->res_pre_pos; *shared_st->res_cur_pos++ = '?'; shared_st->prev_char = '?'; shared_st->q += 4; shared_st->q_cur_pos += 4; } else { // process the first char and continue copy_next_char(shared_st, opts); } return next_st; } /** * @brief Gets the 'digest_end' position to be used as the end of character iteration for the currently * processed stage. * @details If the stage being processed is a 'compression' stage, i.e, it isn't 'stage 1'. The end of the * digest for performing the compression *could be* neither the final position in which 'stage 1' * finalized or the end of the buffer being used to write the digest. If 'stage 1' was parsing a number, * the position used for the end of the compression stage shall be the position of the starting digit in * the number being parsed marked by 'stage_1_st->literal_digit_st.start_pos'. * * @param shared_st Shared state used to continue the query processing. * @param stage_1_st The 'stage 1' state used to decide which will be the 'digest_end' position for the * current stage. */ static __attribute__((always_inline)) inline char* get_stage_digest_end(shared_st* shared_st, stage_1_st* stage_1_st) { char* digest_end = NULL; if (shared_st->st == st_literal_number && stage_1_st->literal_digit_st.start_pos != NULL) { digest_end = stage_1_st->literal_digit_st.start_pos - 1; } else { digest_end = shared_st->res_cur_pos - 1; } return digest_end; } /** * @brief Sets the new starting position for the current stage being processed. * @details Sets the new starting position for the current stage to be the supplied 'next_start_pos', only * if some boundary conditions hold, otherwise, it sets 'res_init_pos' as the new starting position. * * Extra details: * If the current stages processing iteration isn't the first one, the previous iteration of the * current stage to be processed already performed a compression till a certain position. Iterating the * whole result buffer again in this iteration is pointless, since most of the buffer should have been * already compressed by this stage. * * @param shared_st Shared state used to continue the query processing. * @param stage_1_st The 'stage 1' state used to decide which will be the 'digest_end' position for the * current stage. */ static __attribute__((always_inline)) inline void set_stage_next_start_pos(shared_st* shared_st, char* digest_end, char* next_start_pos) { bool initial_it = shared_st->res_init_pos == shared_st->res_it_init_pos; bool valid_next_start_pos = next_start_pos >= shared_st->res_init_pos && next_start_pos < digest_end; if (initial_it == 0 && valid_next_start_pos) { shared_st->res_cur_pos = next_start_pos; shared_st->res_pre_pos = next_start_pos; } else { shared_st->res_cur_pos = shared_st->res_init_pos; shared_st->res_pre_pos = shared_st->res_init_pos; } } /** * @brief Finalizes the compression stage and updates the supplied stage iteration final position 'stage_pre_it_pos'. * @details Copies the required characters beyond stage 'digest_end' that haven't been processed by the * compression stage, like for example, when 'stage 1' was interrupted parsing a digit because the result * buffer run out of memory. * * @param shared_st Shared state used to continue the query processing. * @param digest_end The computed 'digest_end' for the stage being processed. * @param stage_1_st The state from the previous iteration of 'stage 1'. * @param stage_pre_it_pos Pointer to be updated with the final position being processed for compression by * the stage. */ static __attribute__((always_inline)) inline void end_compression_stage_it(shared_st* shared_st, char* digest_end, stage_1_st* stage_1_st, char** stage_pre_it_pos) { if (digest_end == stage_1_st->literal_digit_st.start_pos - 1 && stage_1_st->new_end_pos) { char* f_digits = stage_1_st->literal_digit_st.start_pos; stage_1_st->literal_digit_st.start_pos = shared_st->res_pre_pos; *stage_pre_it_pos = stage_1_st->literal_digit_st.start_pos; while (f_digits < stage_1_st->new_end_pos) { *shared_st->res_pre_pos++ = *f_digits++; shared_st->res_cur_pos++; } *shared_st->res_pre_pos = 0; stage_1_st->new_end_pos = shared_st->res_pre_pos; } else { *shared_st->res_pre_pos = 0; *stage_pre_it_pos = shared_st->res_pre_pos; } } /** * @brief Performs the first stage parsing. This stage replaces values and extra spaces from the query, and * extracts any 'first comment' found within it. * @details This parsing stage is responsible for replacing the following elements from the query: * - String literals. * - Number literals: Hexadecimal, scientific notation, floating point numbers, regular numbers. * - NULL literals, if option 'replace_nulls' is supplied. * - Comments of any class; the first comment found of type '/\**\/' should be retrieved via 'fst_cmnt' * parameter. If the comment is a 'cmd' comment, it should be copied into the query digest instead of * being ignored. * - Leading spaces and double spaces found. * * This stage is the unique stage that performs copy of the characters being processed, all the other stages * perform further compression on the query digest resulted after this stage initial value replacement. * * Implementation Details: * 1. The stage parsing is implemented as an main loop consuming the characters present in the supplied query * buffer, this iteration stops when all the characters have been consumed or the result buffer is * exhausted. * 2. The detection of parsing states is performed by function 'get_next_st', whenever a new parsing state * is required for parsing current and subsequent chars, the transition to that state happens immediately, * without consuming current char. * 3. The state processing functions are responsible for deciding whether or not the characters processed * during that state are copied into the resulting buffer. For states that doesn't automatically switch back * to neutral state 'st_no_mark_found', it's *not required* to consume the first digit. Since this will * automatically takes place at the end of the current iteration. * * @param shared_st Shared state used to continue the query processing. * @param stage_1_st The first stage state to be updated. * @param opts Options used to homogenize queries via 'lowercase' or 'replace_nulls' options. * @param fst_cmnt Pointer to be updated with the found first comment, left unmodified otherwise. */ static __attribute__((always_inline)) inline void stage_1_parsing(shared_st* shared_st, stage_1_st* stage_1_st, const options* opts, char** fst_cmnt) { // state required between different iterations of special parsing states char* res_final_pos = shared_st->res_init_pos + shared_st->d_max_len - 1; cmnt_type_1_st* const cmnt_type_1_st = &stage_1_st->cmnt_type_1_st; literal_string_st* const literal_str_st = &stage_1_st->literal_str_st; literal_digit_st* const literal_digit_st = &stage_1_st->literal_digit_st; // starting state can belong to a previous iteration enum p_st cur_st = shared_st->st; // if the previous iteration was parsing a number if (stage_1_st->new_end_pos != NULL) { shared_st->res_cur_pos = stage_1_st->new_end_pos; shared_st->res_pre_pos = stage_1_st->new_end_pos; } // NOTE: Required for 'digest_corner_cases_3.hjson' // Space detection can fail when comming from another iteration if 'prev_char' is not reset. // This can allow to copy the null terminator due to the logic in 'double spaces' supression. if (shared_st->res_init_pos != shared_st->res_it_init_pos) { shared_st->prev_char = *(shared_st->res_pre_pos - 1); } // Stop when either: // 1. There is no more room left the result buffer. // 2. The final position of the received query has been reached. while (shared_st->res_cur_pos <= res_final_pos && shared_st->q_cur_pos < shared_st->q_len) { if (cur_st == st_no_mark_found) { // update the last position over the return buffer to be the current position shared_st->res_pre_pos = shared_st->res_cur_pos; cur_st = get_next_st(opts, shared_st); // if next st isn't 'no_mark_found' transition to it without consuming current char if (cur_st != st_no_mark_found) { continue; } else { // generic space removal operations // ================================ // Removal of spaces that doesn't belong to any particular parsing state. // ignore all the leading spaces if (shared_st->res_cur_pos == shared_st->res_init_pos && is_space_char(*shared_st->q)) { shared_st->q++; shared_st->q_cur_pos++; continue; } // suppress all the double spaces. // ============================== // // The suppression is performed using the address of the second space found as the // pivoting point for further space suppression in the result buffer: // // ``` // Q: `SELECT\s\s 1` // ^ address used to be replaced by next char // ``` if (is_space_char(shared_st->prev_char) && is_space_char(*shared_st->q)) { // if current position in result buffer is the first space found, we move to the next // position, in order to respect the first space char. if (!is_space_char(*(shared_st->res_cur_pos-1))) { shared_st->res_cur_pos++; } shared_st->prev_char = ' '; *shared_st->res_cur_pos = ' '; shared_st->q++; shared_st->q_cur_pos++; continue; } // copy the current char copy_next_char(shared_st, opts); } } else { if (cur_st == st_cmnt_type_1) { // by default, we don't copy the next char for comments shared_st->copy_next_char = 0; cur_st = process_cmnt_type_1(opts, shared_st, cmnt_type_1_st, fst_cmnt); if (cur_st == st_no_mark_found) { shared_st->copy_next_char = 1; continue; } } else if (cur_st == st_cmnt_type_2) { shared_st->copy_next_char = 0; cur_st = process_cmnt_type_2(shared_st); if (cur_st == st_no_mark_found) { shared_st->copy_next_char = 1; continue; } } else if (cur_st == st_cmnt_type_3) { shared_st->copy_next_char = 0; cur_st = process_cmnt_type_3(shared_st); if (cur_st == st_no_mark_found) { shared_st->copy_next_char = 1; continue; } } else if (cur_st == st_literal_string) { // NOTE: Not required to copy since spaces are not going to be processed here shared_st->copy_next_char = 0; cur_st = process_literal_string(shared_st, literal_str_st); if (cur_st == st_no_mark_found) { shared_st->copy_next_char = 1; continue; } } else if (cur_st == st_literal_number) { shared_st->copy_next_char = 1; cur_st = process_literal_digit(shared_st, literal_digit_st, opts); if (cur_st == st_no_mark_found) { literal_digit_st->first_digit = 1; shared_st->copy_next_char = 1; continue; } } else if (cur_st == st_replace_null) { // shared_st->copy_next_char = 1; cur_st = process_replace_null(shared_st, opts); if (cur_st == st_no_mark_found) { // literal_null_st.null_pos = 0; shared_st->copy_next_char = 1; continue; } } if (shared_st->copy_next_char) { copy_next_char(shared_st, opts); } else { inc_proc_pos(shared_st); } } } // place the final null terminator *shared_st->res_cur_pos = 0; shared_st->st = cur_st; // store final state position stage_1_st->pre_it_pos = shared_st->res_cur_pos; // if stage isn't finished parsing an element, set the current parsing position at which the last // element was copied. if (shared_st->st == st_literal_number) { stage_1_st->new_end_pos = shared_st->res_cur_pos; } else { stage_1_st->new_end_pos = NULL; } } /** * @brief Performs the second stage parsing. This stage is is already a compression stage responsible of * removing the following patterns: * - Spaces after '(', and before ')'. * - Spaces before and after arithmetic operators. * - Removal of (+|-) when acting on a single value. * - When enabled, via 'mysql_thread___query_digests_no_digits', removal of digits that aren't literals. * * @param shared_st Shared state used to continue the query processing. * @param stage_1_st The state resulting from the previous execution of 'stage 1'. * @param stage_2_st The state from previous execution of 'stage 2' to be udpated. * @param opts Options used for deciding wether or not enabling digits replacement. */ static __attribute__((always_inline)) inline void stage_2_parsing(shared_st* shared_st, stage_1_st* stage_1_st, stage_2_st* stage_2_st, const options* opts) { char* digest_end = get_stage_digest_end(shared_st, stage_1_st); // Compute the starting point for the second stage. The offset chosen of (5 + 1) is derived from the // pattern: `? + ddd` where 'd' stands for 'digit'. This pattern could take place in case the first // stage was interrupted while parsing a digit. // // previous_iteration: // // ``` // `,? + d` // ^ first digit 'stage_1' position && last 'stage_2' compression pos // ``` // // next_iteration: // // ``` // `,? + d` // ^ next_start pos // ``` // // Using an offset of at least `6` should prevent missing patterns in this current iteration. char* next_start_pos = stage_2_st->pre_it_pos - (shared_st->gl_c_offset - stage_2_st->c_offset) - (5 + 1); set_stage_next_start_pos(shared_st, digest_end, next_start_pos); // second stage: Space and (+|-) replacement while (shared_st->res_cur_pos <= digest_end) { if (*shared_st->res_cur_pos == ' ') { char lc = '0'; if (shared_st->res_cur_pos > shared_st->res_init_pos) { lc = *(shared_st->res_cur_pos-1); } char rc = *(shared_st->res_cur_pos+1); if (lc == '(' || rc == ')') { shared_st->res_cur_pos++; } else if ((is_arithmetic_op(lc) && rc == '?') || lc == ',' || rc == ',') { char llc = '0'; if (shared_st->res_cur_pos > shared_st->res_init_pos + 1) { llc = *(shared_st->res_cur_pos-2); } if (opts->keep_comment && (llc == '*' && lc == '/')) { *shared_st->res_pre_pos++ = *shared_st->res_cur_pos++; } else { shared_st->res_cur_pos++; } } else if (is_arithmetic_op(rc) && lc == '?' && is_token_char(lc)) { shared_st->res_cur_pos++; } else { *shared_st->res_pre_pos++ = *shared_st->res_cur_pos++; } } else if (*shared_st->res_cur_pos == '+' || *shared_st->res_cur_pos == '-') { char llc = '0'; if (shared_st->res_cur_pos > shared_st->res_init_pos + 1) { llc = *(shared_st->res_cur_pos-2); } char lc = '0'; if (shared_st->res_cur_pos > shared_st->res_init_pos) { lc = *(shared_st->res_cur_pos-1); } char rc = *(shared_st->res_cur_pos+1); // patterns to cover: // - ? + ? // - ?,+? // - c +? // - c + ? // - c+ ? // - c+? // - c, + ? if (lc == ' ') { if (is_normal_char(llc)) { shared_st->res_cur_pos++; } else if (is_token_char(llc) && (llc != '?' && llc != ')') && (rc == '?' || rc == ' ')) { shared_st->res_cur_pos++; } else { *shared_st->res_pre_pos++ = *shared_st->res_cur_pos++; } } else { if (is_token_char(lc) && (lc != '?' && lc != ')') && (rc == '?' || rc == ' ')) { shared_st->res_cur_pos++; } else { *shared_st->res_pre_pos++ = *shared_st->res_cur_pos++; } } } else if (opts->replace_number == 1 && is_digit_char(*shared_st->res_cur_pos) ) { if (shared_st->res_pre_pos > shared_st->res_init_pos && *(shared_st->res_pre_pos-1) != '?') { *shared_st->res_pre_pos++ = '?'; } shared_st->res_cur_pos++; } else { *shared_st->res_pre_pos++ = *shared_st->res_cur_pos++; } } // store this iteration position and compute the compression offset int c_2_offset = digest_end - shared_st->res_pre_pos + 1; stage_2_st->c_offset = c_2_offset > 0 ? c_2_offset : 0; end_compression_stage_it(shared_st, digest_end, stage_1_st, &stage_2_st->pre_it_pos); shared_st->res_cur_pos = shared_st->res_pre_pos; } /** * @brief Performs the third stage compression. This stage is a compression stage responsible for collapsing * the value grouping pattern like '(?,?,?)' into '(?,...)' using the config value given by * 'mysql_thread___query_digests_grouping_limit'. * * @param shared_st Shared state used to continue the query processing. * @param stage_1_st The state resulting from the previous execution of 'stage 1'. * @param stage_3_st The state from previous execution of 'stage 2' to be updated. * @param opts Options used for deciding how to perform the group collapsing. */ static __attribute__((always_inline)) inline void stage_3_parsing(shared_st* shared_st, stage_1_st* stage_1_st, stage_3_st* stage_3_st, const options* opts) { if (opts->grouping_limit == 0) { return; } // compute the 'digest_end' for the stage 3 char* digest_end = get_stage_digest_end(shared_st, stage_1_st); // Compute the starting point for the third stage. The 'min_group_size' value is obtained from // the following pattern: // // previous_iteration - after 'stage_3': // // ``` // `(?,?,?,?,?,?,...+ d` // ^ ^ first digit 'stage_1' position && last 'stage_3' compression pos // | required min 'next_start_pos' for next 'stage_3' // ``` // // The break down is: // * opts->grouping_limit*2: Maximum number of characters of groups left. // * 7: sizeof('...+ ') + sizeof('(') + 1. // int min_group_size = opts->grouping_limit*2 + 7; char* next_start_pos = stage_3_st->pre_it_pos - (shared_st->gl_c_offset - stage_3_st->c_offset) - (min_group_size + 1); set_stage_next_start_pos(shared_st, digest_end, next_start_pos); char group_candidate = 0; // it's a fixed pattern, we can perform a lookahead replacement while (shared_st->res_cur_pos <= digest_end) { // If this isn't the first iteration, it's possible to found an expansion pack '...' that is followed // by characters copied in 'stage_1' during this iteration: // // ``` // `(?,?,?,?,?,?,...,?,?)` // ^ last 'stage_3' compression pos, followed by new: `,?,?)` // ``` if (group_candidate == 1 && (shared_st->res_pre_pos - shared_st->res_init_pos) > 4) { char found_exp_pack = *(shared_st->res_pre_pos-1) == '.' && *(shared_st->res_pre_pos-2) == '.' && *(shared_st->res_pre_pos-3) == '.' && *(shared_st->res_pre_pos-4) == ','; if (found_exp_pack == 1 && ((digest_end - shared_st->res_cur_pos) >= 1)) { // collapse new patterns founds after the expansion char* new_cur_pos = shared_st->res_cur_pos; bool is_last = 0; // if the first character is a ',' we skip it to count the '?,' patterns if (*new_cur_pos == ',') { new_cur_pos += 1; } while ((new_cur_pos < digest_end)) { if (*new_cur_pos == '?' && *(new_cur_pos+1) == ',') { new_cur_pos += 2; } else { if (*new_cur_pos == '?' && *(new_cur_pos+1) == ')') { new_cur_pos += 1; is_last = 1; } break; } } // We update the current position if either: // * At least one '?,' was found. // * The final pattern '?)' was found. if ((new_cur_pos > shared_st->res_cur_pos + 1) || is_last) { shared_st->res_cur_pos = new_cur_pos; } // If the first stage hasn't finished parsing a number literal, the following situation is // possible, since we previously skipped the found ',': // // ``` // `(?,?,?,...,dddd)` // ^ new_cur_pos // ``` // // In this case, we break to avoid copying the last char. That copy should be performed by // `end_compression_stage_it`. if (stage_1_st->literal_digit_st.start_pos) { if (new_cur_pos >= digest_end && is_digit_char(*new_cur_pos)) { break; } } } } char* cur_char = shared_st->res_cur_pos; char pattern_fits = shared_st->res_cur_pos < digest_end - opts->grouping_limit*2; if (group_candidate == 1 && pattern_fits) { // NOTE: Minimal viable pattern for replacement is the starting point: '?,?,'. // This pattern also matches the size of a 32bit register, so probably will only // take one comparison for matching it. This removes a lot of false cases matching the first // '?', or '?,' that could be found in column names when digit replacement is performed. char is_min_pattern = *cur_char == '?' && *(cur_char+1) == ',' && *(cur_char+2) == '?' && (*(cur_char+3) == ',' || *(cur_char+3) == ')'); // The pattern to match shouldn't be preceded by an arithmetic operator, otherwise, patterns // like this '?+?,?,?' could start counting from the first match of '?,', which shouldn't be // the case. if (is_arithmetic_op(*(cur_char-1)) == 0 && is_min_pattern) { int pattern_len = 0; char pattern_broken = 0; char* pattern_pos = shared_st->res_cur_pos; while ((pattern_pos < digest_end) && pattern_broken == 0) { if (*pattern_pos == '?' && *(pattern_pos+1) == ',') { pattern_pos += 2; pattern_len += 1; } else { if (*(pattern_pos+1) == ')') { pattern_broken = 2; } else { pattern_broken = 1; } } } // in case of the final pattern being '?)', we need to count the '?' as being replaced for // the grouping for matching replacements of the exact length. int f_pattern_len = pattern_broken == 2 ? pattern_len * 2 + 1 : pattern_len * 2; if (f_pattern_len >= (opts->grouping_limit * 2 + 3)) { for (int i = 0; i < pattern_len; i++) { if (i < opts->grouping_limit) { *shared_st->res_pre_pos++ = '?'; *shared_st->res_pre_pos++ = ','; } else if (i == opts->grouping_limit) { *shared_st->res_pre_pos++ = '.'; *shared_st->res_pre_pos++ = '.'; *shared_st->res_pre_pos++ = '.'; } } // we jump over the final '?' in case the final pattern was '?)' if (pattern_broken == 2) { shared_st->res_cur_pos = pattern_pos + 1; } else { shared_st->res_cur_pos = pattern_pos - 1; } } else { for (int i = 0; i < pattern_len; i++) { *shared_st->res_pre_pos++ = '?'; *shared_st->res_pre_pos++ = ','; } // Update the current position to the position where pattern was broken shared_st->res_cur_pos = pattern_pos; } } else { *shared_st->res_pre_pos++ = *shared_st->res_cur_pos++; } } else { *shared_st->res_pre_pos++ = *shared_st->res_cur_pos++; } // grouping candidates always start with '(' if (*cur_char == '(') { group_candidate = 1; } else if (*cur_char == ')') { group_candidate = 0; } } int c_3_offset = digest_end - (shared_st->res_pre_pos - 1); stage_3_st->c_offset = c_3_offset > 0 ? c_3_offset : 0; end_compression_stage_it(shared_st, digest_end, stage_1_st, &stage_3_st->pre_it_pos); shared_st->res_cur_pos = shared_st->res_pre_pos; } /** * @brief Check if there is a group pattern of kind '(?,?,?)' following the supplied position. * * @param pos The starting pattern position. The initial '('. * @param opts Options used to compute the pattern length. * * @return '1' if a pattern has been found, '0' otherwise. */ static inline bool is_group_pattern(const char* pos, const options* opts) { int group_size = (1 + opts->grouping_limit*2 + 3 + 1); bool is_group_pattern = 1; int i = 0; for (i = 0; i < group_size; i++) { if (i == 0) { if (*pos != '(') { is_group_pattern = 0; break; } } else if (i == group_size - 1) { if (*(pos + i) != ')') { is_group_pattern = 0; break; } } else if (i % 2 == 1) { if (i <= opts->grouping_limit * 2) { if (*(pos + i) != '?') { is_group_pattern = 0; break; } } else { if (*(pos + i) != '.') { is_group_pattern = 0; break; } } } else { if (i <= opts->grouping_limit * 2) { if (*(pos + i) != ',') { is_group_pattern = 0; break; } } else { if (*(pos + i) != '.') { is_group_pattern = 0; break; } } } } return is_group_pattern; } /** * @brief Performs the fourth stage compression. This stage is a compression stage responsible for collapsing * the value grouping patterns already compressed by 'stage 3' into a more compact representation, e.g: * * Pattern: * * ``` * (?,?,...),(?,?,...),(?,?,...),(?,?,...),(?,?,...) * ``` * * For 'mysql_thread___query_digests_groups_grouping_limit=3' would be compressed into: * * ``` * (?,?,...),(?,?,...),(?,?,...),... * ``` * * @param shared_st Shared state used to continue the query processing. * @param stage_1_st The state resulting from the previous execution of 'stage 1'. * @param stage_4_st The state from previous execution of 'stage 4' to be updated. * @param opts Options used for deciding how to perform the group collapsing. */ static __attribute__((always_inline)) inline void stage_4_parsing(shared_st* shared_st, stage_1_st* stage_1_st, stage_4_st* stage_4_st, const options* opts) { if (opts->groups_grouping_limit == 0 || opts->grouping_limit == 0) { return; } char* digest_end = get_stage_digest_end(shared_st, stage_1_st); // '( + ?,?,n + ... + ') ,' int group_pattern_size = (1 + opts->grouping_limit*2 + 3 + 1 + 1); // Compute the starting point for the fourth stage. Since the previous iteration could have ended in a // non-collapsed chain of patterns (if the expanded version of last pattern didn't fit in the buffer). The // last position from the previous iteration could be: // // * 'mysql_thread___query_digests_grouping_limit': 2 // * 'mysql_thread___query_digests_groups_grouping_limit': 6 // // ``` // (?,?,...),(?,?,...),(?,?,...),(?,?,...),(?,?,...),(?,?,...),(?,?,+d) // ^ last position // ``` // // Since the '7' pattern was never found, no collapsing took place, so in order to ensure that we lie // behind the whole pattern for this iteration we use the offset: // // ``` // (group_pattern_size * (opts->groups_grouping_limit + 2)) // ``` char* next_start_pos = stage_4_st->pre_it_pos - (group_pattern_size * (opts->groups_grouping_limit + 2)); // compute the starting point for the fourth stage set_stage_next_start_pos(shared_st, digest_end, next_start_pos); // it's a fixed pattern, we can perform a lookahead replacement while (shared_st->res_cur_pos <= digest_end) { char* cur_char = shared_st->res_cur_pos; if ((shared_st->res_pre_pos - shared_st->res_init_pos) > 5) { char found_exp_pack = *(shared_st->res_pre_pos-1) == '.' && *(shared_st->res_pre_pos-2) == '.' && *(shared_st->res_pre_pos-3) == '.' && *(shared_st->res_pre_pos-4) == ',' && *(shared_st->res_pre_pos-5) == ')'; if (found_exp_pack == 1) { char* cur_pattern_pos = cur_char; int found_group_patterns = 0; // Jump over the found comma or space, same as in 'stage_3'. For a specific case regarding // the space see 'digest_corner_cases_2.hjson' payload. if (*cur_pattern_pos == ',' || *cur_pattern_pos == ' ') { cur_pattern_pos += 1; } while(cur_pattern_pos + (group_pattern_size - 2) <= digest_end) { if (is_group_pattern(cur_pattern_pos, opts) == 1) { if (cur_pattern_pos + (group_pattern_size - 1) == digest_end) { cur_pattern_pos += group_pattern_size - 1; } else { cur_pattern_pos += group_pattern_size; } found_group_patterns += 1; } else { break; } } if (cur_pattern_pos > shared_st->res_cur_pos + 1) { shared_st->res_cur_pos = cur_pattern_pos - 1; continue; } if (cur_pattern_pos >= digest_end) { break; } } } char pattern_fits = shared_st->res_cur_pos <= // NOTE: Final '+ 1' due to repeating comma in the pattern not in the final case, and the // fact that digest_end is the final character, which is part of the pattern (digest_end - (group_pattern_size * (opts->groups_grouping_limit + 1)) + 2); // fast check for knowing that this can potentially be a group pattern if (pattern_fits && *cur_char == '(' && *(cur_char+1) == '?' && *(cur_char+2) == ',') { char* pattern_start = cur_char; char* cur_pattern_pos = cur_char; int found_group_patterns = 0; while(cur_pattern_pos + (group_pattern_size - 2) <= digest_end) { if (is_group_pattern(cur_pattern_pos, opts) == 1) { cur_pattern_pos += group_pattern_size - 1; if (*cur_pattern_pos == ',') { cur_pattern_pos++; } found_group_patterns += 1; } else { break; } } // count found forward patterns if (found_group_patterns > opts->groups_grouping_limit) { memmove(shared_st->res_pre_pos, pattern_start, (long) group_pattern_size * opts->groups_grouping_limit); shared_st->res_pre_pos += group_pattern_size * opts->groups_grouping_limit; *shared_st->res_pre_pos++ = '.'; *shared_st->res_pre_pos++ = '.'; *shared_st->res_pre_pos++ = '.'; shared_st->res_cur_pos = cur_pattern_pos; } } if (shared_st->res_cur_pos > digest_end) { break; } else { *shared_st->res_pre_pos++ = *shared_st->res_cur_pos++; } } int c_4_offset = digest_end - (shared_st->res_pre_pos - 1); stage_4_st->c_offset = c_4_offset > 0 ? c_4_offset : 0; end_compression_stage_it(shared_st, digest_end, stage_1_st, &stage_4_st->pre_it_pos); shared_st->res_cur_pos = shared_st->res_pre_pos; } /** * @brief Final stage, reponsible of performing final cleanups to the digest after the rest of the processing * is performed, at the moment it peforms: * * * Final space replacement. * * Trimmed digits replacement. * * @param shared_st Shared state used to continue the query processing. * @param stage_1_st Stage 1 final state, used for the trimmed digits replacement. * @param opts Options, currently unused. */ static __attribute__((always_inline)) inline void final_stage(shared_st* shared_st, stage_1_st* stage_1_st, const options* opts) { // Simple final cleanup for making queries more homogeneous when trimmed. // Since literal number processing requires the copy of the literal into the output buffer, processing // could finish before a number is completely parsed, due to compression non being able to create enough // room to complete the processing. In this case, it's possible to have digest ending like: // // ``` // INSERT INTO db.table pi_value VALUES (3.141592 // ^ end because no room for parsing all the digits // ``` // // In this case a final effor is performed to homogenize the query, replacing the literal by '?': // // ``` // INSERT INTO db.table pi_value VALUES (? // ^ replaced literal // ``` if (stage_1_st->literal_digit_st.start_pos != NULL) { if (shared_st->d_max_len <= (shared_st->res_cur_pos - shared_st->res_init_pos)) { if (shared_st->st == st_literal_number && is_digit_char(*stage_1_st->literal_digit_st.start_pos)) { *stage_1_st->literal_digit_st.start_pos++ = '?'; *stage_1_st->literal_digit_st.start_pos = '\0'; } } } // Remove all trailing whitespaces and semicolons // ============================================== // // - Final spaces left by comments which are never collapsed, ex: // // ``` // Q: `select 1.1 -- final_comment \n` // D: `select ? ` // ^ never collapsed // ``` // // - Semicolons (';') marking the end of the query are also removed. { // v1_crashing_payload_06 char* f_char = shared_st->res_cur_pos - 1; while (f_char > shared_st->res_init_pos && (*f_char == ' ' || *f_char == ';')) { f_char--; } f_char++; *f_char = '\0'; // NOTE: Since this is the last operation this isn't really required. But it's left in case this block // is moved in the future. shared_st->res_cur_pos = f_char; } } /** * @brief Helper function for testing 'first_stage' digest parsing. * * @param q Query to be parsed. * @param q_len Length of the supplied queried. * @param fst_cmnt First comment to be filled in case of being found in the query. * @param buf Buffer to be used for writing the resulting digest. * * @return The processed digest. Caller is responsible from freeing if buffer wasn't provided. */ char* mysql_query_digest_first_stage(const char* const q, int q_len, char** const fst_cmnt, char* const buf) { /* buffer to store first comment. */ int d_max_len = get_digest_max_len(q_len, mysql_thread___query_digests_max_query_length); char* res = get_result_buffer(d_max_len, buf); // global options options opts; get_mysql_options(&opts); // state shared between all the parsing states struct shared_st shared_st; memset(&shared_st, 0, sizeof(struct shared_st)); init_shared_st(&shared_st, q, q_len, d_max_len, res); struct stage_1_st stage_1_st; memset(&stage_1_st, 0, sizeof(struct stage_1_st)); init_stage_1_st(&stage_1_st); // perform just the first stage parsing stage_1_parsing(&shared_st, &stage_1_st, &opts, fst_cmnt); final_stage(&shared_st, &stage_1_st, &opts); return res; } /** * @brief Helper function for testing 'second_stage' digest parsing. * * @param q Query to be parsed. * @param q_len Length of the supplied queried. * @param fst_cmnt First comment to be filled in case of being found in the query. * @param buf Buffer to be used for writing the resulting digest. * * @return The processed digest. Caller is responsible from freeing if buffer wasn't provided. */ char* mysql_query_digest_second_stage(const char* const q, int q_len, char** const fst_cmnt, char* const buf) { /* buffer to store first comment. */ int d_max_len = get_digest_max_len(q_len, mysql_thread___query_digests_max_query_length); char* res = get_result_buffer(d_max_len, buf); // global options options opts; get_mysql_options(&opts); // state shared between all the parsing states struct shared_st shared_st; memset(&shared_st, 0, sizeof(struct shared_st)); init_shared_st(&shared_st, q, q_len, d_max_len, res); struct stage_1_st stage_1_st; memset(&stage_1_st, 0, sizeof(struct stage_1_st)); init_stage_1_st(&stage_1_st); struct stage_2_st stage_2_st; memset(&stage_2_st, 0, sizeof(struct stage_2_st)); // perform just the first stage parsing stage_1_parsing(&shared_st, &stage_1_st, &opts, fst_cmnt); // second stage parsing stage_2_parsing(&shared_st, &stage_1_st, &stage_2_st, &opts); final_stage(&shared_st, &stage_1_st, &opts); return res; } /** * @brief Parse the supplied query and returns a query digest. Newer implementation based on different parsing * stages in order to simplify branching and processing logic: * * - First stage: Replacing of literal values and double spaces. The goal of this stage is homogenize the * query values as much as possible to reduce branching in further processing stages. * - Second stage: Replacing of extra spaces and arithmetic operators (+|-) when they are in front of a * single value. * - Third stage: Perform different supported grouping operations for the already replaced values. * * @param s The query to be parsed. * @param len The length of the received query. * @param fst_cmnt Pointer to store the fst cmnt found in the query, if any. * @param buf Buffer to use to store the digest for the supplied query, if no buffer is supplied, memory will * be allocated based on 'mysql_thread___query_digests_max_query_length' and supplied query length. * * @return A pointer to the start of the supplied buffer, or the allocated memory containing the digest. */ char* mysql_query_digest_and_first_comment(const char* const q, int q_len, char** const fst_cmnt, char* const buf, const options* opts) { #ifdef DEBUG if (buf != NULL) { memset(buf, 0, 127); } #endif /* buffer to store first comment. */ int d_max_len = get_digest_max_len(q_len, opts->max_query_length); char* res = get_result_buffer(d_max_len, buf); #ifdef DEBUG res[d_max_len] = 0; #endif // state shared between all the parsing states struct shared_st shared_st; memset(&shared_st, 0, sizeof(struct shared_st)); init_shared_st(&shared_st, q, q_len, d_max_len, res); // individual states for stages struct stage_1_st stage_1_st; memset(&stage_1_st, 0, sizeof(struct stage_1_st)); init_stage_1_st(&stage_1_st); struct stage_2_st stage_2_st; struct stage_3_st stage_3_st; struct stage_4_st stage_4_st; memset(&stage_2_st, 0, sizeof(struct stage_2_st)); memset(&stage_3_st, 0, sizeof(struct stage_3_st)); memset(&stage_4_st, 0, sizeof(struct stage_4_st)); char min_digest_size = 0; // TODO: This may requires a stopping point, configurable or not, otherwise parsing can become slow for // very big queries that will require multiple compression stages for processing them. Instead if a // maximum number of iterations is imposed, those queries will stop being parsed before the maximum // compression, but the overhead can be greatly reduced. Example of these queries can be: // // ``` // ˇ Query continues... // INSERT INTO db.table (colj-1,colk,...) VALUES (?,...),(?,...) ON DUPLICATE KEY UPDATE col1 = VALUES(col2) + VALUES(col3)') // 'n' number of values ^ ^ 'm' number of repetitions // ``` // // If 'n' and 'm' are big numbers, the number of iterations for performing the collapsing would totally be // dependent of: length(query) / max_query_digest_length. Most of this kind of query, will keep being // collapsed, since none of the iterations will fill the buffer, since all the new values will be // collapsed. Due to this, we might want to offer a way or limit to stop the iteration and offer a // trade off between compression and performance for very big queries. while (min_digest_size == 0) { stage_1_parsing(&shared_st, &stage_1_st, opts, fst_cmnt); stage_2_parsing(&shared_st, &stage_1_st, &stage_2_st, opts); stage_3_parsing(&shared_st, &stage_1_st, &stage_3_st, opts); stage_4_parsing(&shared_st, &stage_1_st, &stage_4_st, opts); // compute the compression offset of the whole iteration shared_st.gl_c_offset = stage_1_st.pre_it_pos - shared_st.res_cur_pos; if ( shared_st.q_cur_pos >= shared_st.q_len || d_max_len <= (shared_st.res_cur_pos - shared_st.res_init_pos) || shared_st.gl_c_offset == 0 ) { min_digest_size = 1; } else { // we need to update the shared state for processing again from the previous ending point char* new_start_point = shared_st.res_cur_pos; shared_st.res_it_init_pos = new_start_point; shared_st.res_cur_pos = new_start_point; shared_st.res_pre_pos = new_start_point; } } final_stage(&shared_st, &stage_1_st, opts); return res; } // For TAP tests char* mysql_query_digest_and_first_comment_2(const char* const q, int q_len, char** const fst_cmnt, char* const buf) { // global options options opts; get_mysql_options(&opts); return mysql_query_digest_and_first_comment(q, q_len, fst_cmnt, buf, &opts); } static __attribute__((always_inline)) inline enum p_st process_literal_string_space_rm(shared_st* shared_st, literal_string_st* str_st) { enum p_st next_state = st_literal_string; // process the first delimiter if (str_st->delim_num == 0) { str_st->delim_char = *shared_st->q; str_st->delim_num = 1; // TODO: Remove exp space replacement *shared_st->res_cur_pos++ = *shared_st->q; // consume the delimiter from the query shared_st->q++; shared_st->q_cur_pos++; } // need to be ignored case if(shared_st->res_cur_pos > shared_st->res_pre_pos + SIZECHAR) { if( (shared_st->prev_char == '\\' && *shared_st->q == '\\') || // to process '\\\\', '\\' (shared_st->prev_char == '\\' && *shared_st->q == str_st->delim_char) || // to process '\'' (shared_st->prev_char == str_st->delim_char && *shared_st->q == str_st->delim_char) // to process '''' ) { shared_st->prev_char = 'X'; shared_st->q++; shared_st->q_cur_pos++; return next_state; } } // satisfied closing string - swap string to ? if( *shared_st->q == str_st->delim_char && (shared_st->d_max_len == shared_st->q_cur_pos+1 || *(shared_st->q + SIZECHAR) != str_st->delim_char) ) { shared_st->res_cur_pos = shared_st->res_pre_pos; char* _p = shared_st->res_pre_pos - 3; // remove '+|-' symbols before the found literal if ( _p >= shared_st->res_init_pos && ( *(_p+2) == '-' || *(_p+2) == '+') ) { if ( ( *(_p+1) == ',' ) || ( *(_p+1) == '(' ) || ( ( *(_p+1) == ' ' ) && ( *_p == ',' || *_p == '(' ) ) ) { shared_st->res_cur_pos--; } } // remove spaces before the found literal if ( _p >= shared_st->res_init_pos && is_space_char(*(_p + 2))) { if ( ( *(_p+1) == ',' ) || ( *(_p+1) == '(' ) || ( is_arithmetic_op(*(_p+1)) ) ) { if ( _p >= shared_st->res_init_pos && ( *(_p+3) == '\''|| *(_p+3) == '"' )) { shared_st->res_cur_pos--; } } } // place the replacement mark *shared_st->res_cur_pos++ = '?'; shared_st->prev_char = '?'; // don't copy this char if last if (shared_st->d_max_len == shared_st->q_cur_pos + 1) { shared_st->copy_next_char = 0; // keep the same state, no token was found return next_state; } // reinit the string literal state str_st->delim_char = 0; str_st->delim_num = 0; // update the shared state shared_st->prev_char = str_st->delim_char; if(shared_st->q_cur_pos < shared_st->d_max_len) { shared_st->q++; } shared_st->q_cur_pos++; // exit the literal parsing state next_state = st_no_mark_found; } return next_state; } static __attribute__((always_inline)) inline enum p_st process_literal_digit_space_rm(shared_st* shared_st, literal_digit_st* digit_st, options* opts) { enum p_st next_state = st_literal_number; // consume the first digit if (digit_st->first_digit == 1 && is_token_char(*(shared_st->q-1)) && is_digit_char(*shared_st->q)) { // place the previous position at the number start *shared_st->res_cur_pos++ = *shared_st->q; digit_st->first_digit = 0; shared_st->q++; shared_st->q_cur_pos++; } // is float if ( *shared_st->q == '.' || (*shared_st->q == 'e' || *shared_st->q == 'E') || ( (*shared_st->q == '+' || *shared_st->q == '-') && (shared_st->prev_char == 'e' || shared_st->prev_char == 'E') ) ) { shared_st->prev_char = *shared_st->q; shared_st->copy_next_char = 0; return next_state; } // token char or last char if (is_token_char(*shared_st->q) || shared_st->d_max_len == shared_st->q_cur_pos + 1) { if (is_digit_string(shared_st->res_pre_pos, shared_st->res_cur_pos)) { shared_st->res_cur_pos = shared_st->res_pre_pos; char* _p = shared_st->res_pre_pos - 3; // remove symbol and keep parenthesis or comma if (_p >= shared_st->res_init_pos && ( *(_p+2) == '-' || *(_p+2) == '+') ) { if ( ( *(_p+1) == ',' ) || (*(_p+1) == '(') || ( (*(_p+1) == ' ') && (*_p == ',' || *_p == '(') ) ) { shared_st->res_cur_pos--; } } // Remove spaces before number counting with possible '.' presence if (_p >= shared_st->res_init_pos && *_p == '.' && (*(_p+1) == ' ' || *(_p+1) == '.') && (*(_p+2) == '-' || *(_p+2) == '+') ) { if (*(_p + 1) == ' ') { shared_st->res_cur_pos--; } shared_st->res_cur_pos--; } // remove spaces after a opening bracket when followed by a number if (_p >= shared_st->res_init_pos && *(_p+1) == '(' && *(_p+2) == ' ') { shared_st->res_cur_pos--; } // remove spaces before number if (_p >= shared_st->res_init_pos && is_space_char(*(_p + 2))) { // a point '.' can be found prior to a number in case of query grouping if ( _p >= shared_st->res_init_pos && (*(_p+1) == '-' || *(_p+1) == '+' || *(_p+1) == '*' || *(_p+1) == '/' || *(_p+1) == '%' || *(_p+1) == ',' || *(_p+1) == '.') ) { shared_st->res_cur_pos--; } } // place the replacement mark *shared_st->res_cur_pos++ = '?'; shared_st->prev_char = '?'; // don't copy this char if last if (shared_st->d_max_len == shared_st->q_cur_pos + 1) { shared_st->copy_next_char = 0; // keep the same state, no token was found return next_state; } } else { // collapse any digits found in the string if (opts->replace_number) { int str_len = shared_st->res_cur_pos - shared_st->res_pre_pos + 1; int collapsed = 0; for (int i = 0; i < str_len; i++) { char* const c_p_r_t = ((char*)shared_st->res_pre_pos + i); char* const n_p_r_t = ((char*)shared_st->res_pre_pos + i + 1); if (is_digit_char(*c_p_r_t) && is_digit_char(*n_p_r_t)) { memmove(c_p_r_t, c_p_r_t + 1, str_len - i); collapsed += 1; } } shared_st->res_cur_pos -= collapsed; int new_str_len = shared_st->res_cur_pos - shared_st->res_pre_pos + 1; for (int i = 0; i < new_str_len; i++) { char* const c_p_r_t = ((char*)shared_st->res_cur_pos + i); if (is_digit_char(*c_p_r_t)) { *c_p_r_t = '?'; } } } } next_state = st_no_mark_found; } return next_state; } /** * @brief Parse the supplied query and returns a query digest in just one iteration. This is an earlier * implementation than the newer one based in stages. This implementations is incomplete in the sense that * doesn't cover all the supported features in the original one. * * @param s The query to be parsed. * @param len The length of the received query. * @param fst_cmnt Pointer to store the fst cmnt found in the query, if any. * @param buf Buffer to use to store the digest for the supplied query, if no buffer is supplied, memory will * be allocated based on 'mysql_thread___query_digests_max_query_length' and supplied query length. * * @return A pointer to the start of the supplied buffer, or the allocated memory containing the digest. */ char* mysql_query_digest_and_first_comment_one_it(char* q, int q_len, char** fst_cmnt, char* buf) { #ifdef DEBUG if (buf != NULL) { memset(buf, 0, 127); } #endif int d_max_len = get_digest_max_len(q_len, mysql_thread___query_digests_max_query_length); char* res = get_result_buffer(d_max_len, buf); // global options options opts; get_mysql_options(&opts); // state shared between all the parsing states struct shared_st shared_st; memset(&shared_st, 0, sizeof(struct shared_st)); shared_st.q = q; shared_st.q_len = q_len; shared_st.d_max_len = d_max_len; shared_st.res_init_pos = res; shared_st.res_it_init_pos = res; shared_st.res_cur_pos = res; shared_st.res_pre_pos = res; // state required between different iterations of special parsing states struct cmnt_type_1_st c_t_1_st; struct literal_string_st literal_str_st; struct literal_digit_st literal_digit_st; memset(&c_t_1_st, 0, sizeof(struct cmnt_type_1_st)); memset(&literal_str_st, 0, sizeof(struct literal_string_st)); memset(&literal_digit_st, 0, sizeof(struct literal_digit_st)); enum p_st cur_st = st_no_mark_found; // start char consumption while (shared_st.q_cur_pos < d_max_len) { if (cur_st == st_no_mark_found) { // update the last position over the return buffer to be the current position shared_st.res_pre_pos = shared_st.res_cur_pos; cur_st = get_next_st(&opts, &shared_st); // if next st isn't 'no_mark_found' transition to it without consuming current char if (cur_st != st_no_mark_found) { continue; } else { // generic space removal operations // ================================ // Removal of spaces that doesn't belong to any particular parsing state. // ignore all the leading spaces if (shared_st.res_cur_pos == shared_st.res_init_pos && is_space_char(*shared_st.q)) { shared_st.q++; shared_st.q_cur_pos++; continue; } // suppress all the double spaces. // ============================== // // The suppression is performed using the address of the second space found as the // pivoting point for further space suppression in the result buffer: // // ``` // Q: `SELECT\s\s 1` // ^ address used to be replaced by next char // ``` if (is_space_char(shared_st.prev_char) && is_space_char(*shared_st.q)) { // if current position in result buffer is the first space found, we move to the next // position, in order to respect the first space char. if (!is_space_char(*(shared_st.res_cur_pos-1))) { shared_st.res_cur_pos++; } shared_st.prev_char = ' '; *shared_st.res_cur_pos = ' '; shared_st.q++; shared_st.q_cur_pos++; continue; } { char* p = shared_st.res_cur_pos - 2; // suppress spaces before arithmetic operators if (p >= shared_st.res_init_pos && is_space_char(shared_st.prev_char) && is_arithmetic_op(*shared_st.q)) { if (*p == '?') { shared_st.prev_char = *shared_st.q; --shared_st.res_cur_pos; *shared_st.res_cur_pos++ = *shared_st.q; shared_st.q++; shared_st.q_cur_pos++; continue; } } // suppress spaces before and after commas if ( p >= shared_st.res_init_pos && is_space_char(shared_st.prev_char) && ((*shared_st.q == ',') || (*p == ',')) ) { if (*shared_st.q == ',') { --shared_st.res_cur_pos; *shared_st.res_cur_pos++ = *shared_st.q; shared_st.prev_char = ','; shared_st.q++; shared_st.q_cur_pos++; } else { shared_st.prev_char = ','; --shared_st.res_cur_pos; } continue; } // suppress spaces before closing brackets when grouping or mark is present if ( p >= shared_st.res_init_pos && (*p == '.' || *p == '?') && is_space_char(shared_st.prev_char) && (*shared_st.q == ')') ) { shared_st.prev_char = *shared_st.q; --shared_st.res_cur_pos; *shared_st.res_cur_pos++ = *shared_st.q; shared_st.q++; shared_st.q_cur_pos++; continue; } } // copy the current char copy_next_char(&shared_st, &opts); } } else { if (cur_st == st_cmnt_type_1) { // by default, we don't copy the next char for comments shared_st.copy_next_char = 0; cur_st = process_cmnt_type_1(&opts, &shared_st, &c_t_1_st, fst_cmnt); if (cur_st == st_no_mark_found) { shared_st.copy_next_char = 1; continue; } } else if (cur_st == st_cmnt_type_2) { shared_st.copy_next_char = 0; cur_st = process_cmnt_type_2(&shared_st); if (cur_st == st_no_mark_found) { shared_st.copy_next_char = 1; continue; } } else if (cur_st == st_cmnt_type_3) { shared_st.copy_next_char = 0; cur_st = process_cmnt_type_3(&shared_st); if (cur_st == st_no_mark_found) { shared_st.copy_next_char = 1; continue; } } else if (cur_st == st_literal_string) { shared_st.copy_next_char = 1; cur_st = process_literal_string_space_rm(&shared_st, &literal_str_st); if (cur_st == st_no_mark_found) { shared_st.copy_next_char = 1; continue; } } else if (cur_st == st_literal_number) { shared_st.copy_next_char = 1; cur_st = process_literal_digit_space_rm(&shared_st, &literal_digit_st, &opts); if (cur_st == st_no_mark_found) { literal_digit_st.first_digit = 1; shared_st.copy_next_char = 1; continue; } } if (shared_st.copy_next_char) { copy_next_char(&shared_st, &opts); } else { // if we do not copy we skip the next char, but copy it to `prev_char` shared_st.prev_char = *shared_st.q++; shared_st.q_cur_pos++; } } } // remove all trailing whitespaces // =============================== // // Final spaces left by comments which are never collapsed, ex: // // ``` // Q: `select 1.1 -- final_comment \n` // D: `select ? ` // ^ never collapsed // ``` if (shared_st.res_cur_pos > shared_st.res_it_init_pos) { char* wspace = shared_st.res_cur_pos - 1; while (*wspace == ' ') { wspace--; } wspace++; *wspace = '\0'; } // place the final null terminator *shared_st.res_cur_pos = 0; return res; } char* mysql_query_strip_comments(char *s, int _len, bool lowercase) { int i = 0; int len = _len; char *r = (char *) malloc(len + SIZECHAR); char *p_r = r; char *p_r_t = r; char prev_char = 0; char flag = 0; char fns=0; while(i < len) { // ================================================= // START - read token char and set flag what's going on. // ================================================= if(flag == 0) { // store current position p_r_t = p_r; // comment type 1 - start with '/*' if(prev_char == '/' && *s == '*') { flag = 1; } // comment type 2 - start with '#' else if(*s == '#') { flag = 2; } // comment type 3 - start with '--' else if(prev_char == '-' && *s == '-' && ((*(s+1)==' ') || (*(s+1)=='\n') || (*(s+1)=='\r') || (*(s+1)=='\t') )) { flag = 3; } // not above case - remove duplicated space char else { flag = 0; if (fns==0 && is_space_char(*s)) { s++; i++; continue; } if (fns==0) fns=1; if(is_space_char(prev_char) && is_space_char(*s)){ prev_char = ' '; *p_r = ' '; s++; i++; continue; } } } // ================================================= // PROCESS and FINISH - do something on each case // ================================================= else { // -------- // comment // -------- if( // comment type 1 - /* .. */ (flag == 1 && prev_char == '*' && *s == '/') || // comment type 2 - # ... \n (flag == 2 && (*s == '\n' || *s == '\r' || (i == len - 1) )) || // comment type 3 - -- ... \n (flag == 3 && (*s == '\n' || *s == '\r' || (i == len -1) )) ) { p_r = p_r_t; if (flag == 1 || (i == len -1)) { p_r -= SIZECHAR; } prev_char = ' '; flag = 0; s++; i++; continue; } } // ================================================= // COPY CHAR // ================================================= // convert every space char to ' ' if (lowercase==false) { *p_r++ = !is_space_char(*s) ? *s : ' '; } else { *p_r++ = !is_space_char(*s) ? (tolower(*s)) : ' '; } prev_char = *s++; i++; } // remove a trailing space if (p_r>r) { char *e=p_r; e--; if (*e==' ') { *e=0; } } *p_r = 0; return r; }