You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
proxysql/lib/pgsql_tokenizer.cpp

3325 lines
105 KiB

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include "c_tokenizer.h"
// Per-thread digest configuration variables, defined outside this file. They are copied into an
// 'options' struct by 'get_pgsql_options'.
extern __thread int pgsql_thread___query_digests_max_query_length;
extern __thread bool pgsql_thread___query_digests_lowercase;
extern __thread bool pgsql_thread___query_digests_replace_null;
extern __thread bool pgsql_thread___query_digests_no_digits;
// NOTE(review): the two grouping limits below are declared 'bool', but 'replace_with_q_mark' compares its
// 'grouping_lim' argument against a running counter, which suggests a numeric limit. Confirm that these
// declarations match the type used at the point of definition.
extern __thread bool pgsql_thread___query_digests_grouping_limit;
extern __thread bool pgsql_thread___query_digests_groups_grouping_limit;
extern __thread bool pgsql_thread___query_digests_keep_comment;
// Size of one 'char'; used as a "+1" when sizing buffers and stepping pointers.
#define SIZECHAR sizeof(char)
/**
 * @brief Tells whether a char may be part of an unquoted name (e.g. a table name).
 *
 * @param c The char to classify.
 *
 * @return '1' for ASCII letters, ASCII digits, '$' and '_'; '0' for anything else.
 */
static inline char is_normal_char(char c)
{
	const int lower_letter = (c >= 'a' && c <= 'z');
	const int upper_letter = (c >= 'A' && c <= 'Z');
	const int digit = (c >= '0' && c <= '9');
	const int extra = (c == '$' || c == '_');

	return (lower_letter || upper_letter || digit || extra) ? 1 : 0;
}
/**
 * @brief Tells whether a char is a 'token' char, i.e. anything that can't be part of an unquoted name.
 *
 * @param c The char to classify.
 *
 * @return '1' when 'is_normal_char' rejects the char, '0' otherwise.
 */
static inline char is_token_char(char c)
{
	return is_normal_char(c) ? 0 : 1;
}
/**
 * @brief Tells whether a char is one of the whitespace chars collapsed by the tokenizer.
 *
 * @param c The char to classify.
 *
 * @return '1' for space, tab, line feed and carriage return; '0' otherwise.
 */
static inline char is_space_char(char c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
	case '\r':
		return 1;
	default:
		return 0;
	}
}
/**
 * @brief Tells whether a char is an ASCII decimal digit.
 *
 * @param c The char to classify.
 *
 * @return '1' for '0'..'9', '0' otherwise.
 */
static inline char is_digit_char(char c)
{
	return (c >= '0' && c <= '9') ? 1 : 0;
}
/**
 * @brief Tells whether a char is a valid hexadecimal digit.
 *
 * @param c The char to classify.
 *
 * @return '1' for '0'..'9', 'a'..'f' and 'A'..'F'; '0' otherwise.
 */
static inline char is_hex_char(char c)
{
	if (c >= '0' && c <= '9') {
		return 1;
	}
	if (c >= 'a' && c <= 'f') {
		return 1;
	}
	if (c >= 'A' && c <= 'F') {
		return 1;
	}
	return 0;
}
/**
 * @brief Checks whether the chars in the range ['f', 't') look like a numeric literal.
 * @details Accepted shapes are the ones listed in the original implementation: '0x'/'0X' prefixed hex,
 *   'n.m', 'nE+m', 'nE-m' and 'Em'. The check is deliberately loose: '.' and 'e'/'E' are accepted at any
 *   position of a non-hex string.
 *
 *   When 'f == t' the single char pointed to by 'f' is tested for being a digit.
 *
 *   Previous versions of this function read '*(f-1)' on the very first iteration, i.e. one byte BEFORE the
 *   supplied range, which is an out-of-bounds read when 'f' is the start of a buffer and made the result
 *   depend on memory that isn't part of the string. The look-behind is now limited to the supplied range.
 *   See also 'is_digit_string_2', which bounds the look-behind using the result buffer start.
 *
 * @param f Initial position of the string being checked.
 * @param t Final position (exclusive) of the string being checked.
 *
 * @return '1' if the supplied string is recognized as a 'digit_string', '0' otherwise.
 */
static char is_digit_string(char *f, char *t)
{
	if (f == t) {
		return is_digit_char(*f) ? 1 : 0;
	}

	int is_hex = 0;
	int i = 0;

	// 0x, 0X, n.m, nE+m, nE-m, Em
	while (f != t) {
		// Only look behind when the previous char belongs to the supplied range.
		const char prev = (i > 0) ? *(f - 1) : '\0';
		// <ctype.h> functions require a value representable as 'unsigned char' (or EOF).
		const char is_float =
			*f == '.' || tolower((unsigned char)*f) == 'e' ||
			(tolower((unsigned char)prev) == 'e' && (*f == '+' || *f == '-'));

		if (i == 1 && prev == '0' && (*f == 'x' || *f == 'X')) {
			// '0x' / '0X' prefix: the rest of the string must be hex digits
			is_hex = 1;
		} else if (!is_hex && !is_digit_char(*f) && is_float == 0) {
			// none hex
			return 0;
		} else if (is_hex && !is_hex_char(*f)) {
			// hex
			return 0;
		}

		f++;
		i++;
	}

	return 1;
}
/**
 * @brief Tells whether a char is one of the arithmetic operators '+', '-', '*', '/' or '%'.
 *
 * @param op The char to classify.
 *
 * @return '1' if 'op' is an arithmetic operator, '0' otherwise.
 */
static inline char is_arithmetic_op(char op) {
	switch (op) {
	case '+':
	case '-':
	case '*':
	case '/':
	case '%':
		return 1;
	default:
		return 0;
	}
}
/**
 * @brief Writes the replacement for a value at the current output position, honoring the grouping limit.
 * @details Three cases, depending on the number of values already emitted for the current group:
 *   - Limit not reached: a '?' is written; the counter is only incremented when 'grouping_digest' is set.
 *   - Limit reached for the first time: '...' is written and '*grouping_limit_exceeded' is set.
 *   - Limit already exceeded: nothing is written. Since delimiters are always copied, a ',' left right
 *     before the output position (copied after the previously placed '...') is dropped.
 *
 *   NOTE: Avoiding the copy of delimiters while grouping could lead to commas not being copied before
 *   values that aren't replaced, like 'NULL' values; that's why the trailing ',' is removed here instead.
 *
 * @param grouping_digest Non-zero if the replaced value counts towards the grouping limit.
 * @param grouping_lim Maximum number of '?' to be emitted for a group.
 * @param grouping_count In/out counter of values already emitted for the group.
 * @param p_r In/out pointer to the current output position.
 * @param grouping_limit_exceeded In/out flag set once '...' has been emitted.
 */
static inline void replace_with_q_mark(
	char grouping_digest, int grouping_lim, int* grouping_count, char** p_r, char* grouping_limit_exceeded
) {
	char* out = *p_r;

	if (*grouping_count < grouping_lim) {
		*out++ = '?';
		if (grouping_digest) {
			++(*grouping_count);
		}
	} else if (!(*grouping_limit_exceeded)) {
		for (int dots = 0; dots < 3; dots++) {
			*out++ = '.';
		}
		*grouping_limit_exceeded = 1;
	} else if (out[-1] == ',') {
		--out;
	}

	*p_r = out;
}
/**
 * @brief Helper function that fills the supplied 'options' struct with the current values of the
 *   'pgsql_thread___query_digests_*' configuration variables.
 *
 * @param opts The options struct to be initialized. Must not be NULL.
 */
static inline void get_pgsql_options(options* opts) {
	// limits
	opts->max_query_length = pgsql_thread___query_digests_max_query_length;
	opts->grouping_limit = pgsql_thread___query_digests_grouping_limit;
	opts->groups_grouping_limit = pgsql_thread___query_digests_groups_grouping_limit;

	// on/off switches; note that 'no_digits' is exposed as 'replace_number'
	opts->lowercase = pgsql_thread___query_digests_lowercase;
	opts->replace_null = pgsql_thread___query_digests_replace_null;
	opts->replace_number = pgsql_thread___query_digests_no_digits;
	opts->keep_comment = pgsql_thread___query_digests_keep_comment;
}
/**
 * @brief Enum holding all the states responsible for value parsing used during 'stage 1' parsing.
 * @details The per-state notes below describe the input that makes 'get_next_st' select each state.
 */
enum p_st {
st_no_mark_found = 0, // default state: no special construct starts at the current char
st_cmnt_type_1 = 1, // block comment, starts with a slash followed by '*'
st_cmnt_type_2 = 2, // line comment, starts with '--'
st_literal_string = 3, // single-quoted string following a token char
st_literal_number = 4, // digit following a token char
st_replace_null = 5, // 'n'/'N' following a token char, only when 'replace_null' is enabled
st_dollar_quote_string = 6, // dollar-quoted string: '$$' or '$tag$'
st_pg_typecast = 7, // '::' followed by a name char or a double quote
st_literal_prefix_type = 8, // prefixed string: U&'..', E'..', B'..' or X'..' (case-insensitive)
st_replace_boolean = 9, // 't'/'f' (case-insensitive) following a token char
st_array_literal = 10, // 'ARRAY' (case-insensitive), optional whitespace, then '['
st_quoted_identifier = 11 // double-quoted identifier following a token char
};
/**
 * @brief Parsing information from the received query and the result buffer, shared between the different
 *   processing stages.
 * @details 'init_shared_st' only sets the query/result pointers, the lengths and the initial state; the
 *   remaining members are presumably zero-initialized by the owner of the struct - TODO confirm.
 */
typedef struct shared_st {
/* @brief Global computed compression offset from the previous iteration. Used when the uncompressed query
exceeds the maximum buffer size specified by `pgsql_thread___query_digests_max_query_length` */
int gl_c_offset;
/* @brief Maximum length of the resulting digest. */
int d_max_len;
/* @brief Pointer to the current reading position of the supplied query. */
const char* q;
/* @brief Length of the supplied query. */
int q_len;
/* @brief Current position (index) of the iteration over the supplied query. Advanced together with 'q'. */
int q_cur_pos;
/* @brief Pointer to the initial position of the result buffer. */
char* res_init_pos;
/* @brief Pointer to the initial position of the result buffer *for the current processing iteration* */
char* res_it_init_pos;
/* @brief Current position of the iteration over the result buffer. */
char* res_cur_pos;
/* @brief Position in the result buffer prior to the start of any parsing state that isn't 'no_mark_found'.
Value-replacing states rewind 'res_cur_pos' to this position before writing the '?' mark. */
char* res_pre_pos;
/* @brief The current state being processed by 'stage_1'. */
enum p_st st;
/* @brief Last char consumed from the query, or a char imposed by one of the parsing states. */
char prev_char;
/* @brief Preserve the currently imposed 'prev_char' **for the current** char processing instead of
replacing it. Consumed (reset to 'false') by 'inc_proc_pos'. */
bool keep_prev_char;
/* @brief Decides whether or not the next char should be copied during 'stage_1'. */
bool copy_next_char;
} shared_st;
/**
 * @brief State used for parsing 'type_1' comments, i.e: /\* *\/.
 */
typedef struct cmnt_type_1_st {
/* @brief Number of chars of the comment currently being processed that have been stored in the
'cur_cmd_cmnt' buffer. Reset when a top-level comment starts and after the comment is copied out. */
int cur_cmd_cmnt_len;
/**
* @brief Flag showing first comment parsing state. '0' when no comment or end has been found, and '1'
* when the first comment has already been found.
* @details This flag is NEVER reset, since only the first found comment is retrieved for being further
* processed.
*/
int fst_cmnt_end;
/* @brief Counter keeping track of the number of chars copied into the 'first_comment' buffer. */
int fst_cmnt_len;
/* @brief Nesting level for nested comments. '0' when not inside a comment. */
int nest_level;
} cmnt_type_1_st;
/**
 * @brief State used for parsing 'literal string' values, i.e: 'foo', etc..
 */
typedef struct literal_string_st {
/**
* @brief Flag showing if the first delimiter of a literal string has been found.
* '0' when it hasn't yet been found, and '1' while processing a literal string.
*/
int delim_num;
/* @brief Delimiter char found for the literal string being processed. */
char delim_char;
/* @brief Position in the query of the opening delimiter of the string being processed. */
const char* q_start_pos;
bool is_unicode; /* set only for U&'...' */
} literal_string_st;
/**
 * @brief State used for parsing 'quoted identifier' values, i.e: "foo", etc..
 * @details Mirrors the layout of 'literal_string_st' without the unicode flag.
 */
typedef struct quoted_identifier_st {
int delim_num; // 0 = not started, 1 = in progress
char delim_char; // Always '"' for PostgreSQL
const char* q_start_pos; // Start position in query
} quoted_identifier_st;
/**
 * @brief State used for parsing 'literal digit' values, e.g: 84, 0x100, 1E-10, etc...
 */
typedef struct literal_digit_st {
// 'true' until the first digit of a literal has been processed; initialized by 'init_stage_1_st'.
bool first_digit;
// Start position of the literal in the result buffer ('res_pre_pos' when the first digit was found).
char* start_pos;
} literal_digit_st;
/**
 * @brief State used for parsing dollar-quoted string values, i.e: $$foo$$, $tag$bar$tag$, etc..
 * @details 'tag_start' being NULL means that no dollar-quoted string is currently being processed.
 */
typedef struct dollar_quote_string_st {
const char* tag_start; // pointer to the first char of the tag, right after the opening '$' (points into the query)
size_t tag_len; // length of tag (can be 0 for $$)
} dollar_quote_string_st;
/**
 * @brief Created for an alternative implementation of NULL parsing.
 * Currently unused (it isn't a member of 'stage_1_st'). TODO: Remove.
 */
typedef struct literal_null_st {
int null_pos;
} literal_null_st;
/**
 * @brief State used for parsing PostgreSQL type casts, i.e: ::typename.
 */
typedef struct pg_typecast_st {
// Presumably set once the leading '::' has been consumed - TODO confirm against the state handler.
bool started;
} pg_typecast_st;
/**
 * @brief State used for parsing PostgreSQL array literals, i.e: ARRAY[...]
 */
typedef struct array_literal_st {
int bracket_depth; // Track nesting depth of brackets
} array_literal_st;
/**
 * @brief State used for 'stage_1' parsing. Groups one sub-state per value-parsing state of 'p_st'.
 */
typedef struct stage_1_st {
struct cmnt_type_1_st cmnt_type_1_st;
struct literal_string_st literal_str_st;
struct literal_digit_st literal_dig_st;
struct dollar_quote_string_st dollar_quote_str_st;
struct pg_typecast_st pg_tc_st;
struct array_literal_st array_st;
struct quoted_identifier_st quoted_iden_st;
/* @brief Holds the previous iteration parsing ending position. */
char* pre_it_pos;
/**
* @brief Previous iteration parsing ending position for 'stage_1'.
* @details This position should be kept, as the 'stage_1' final position may differ from the final
* positions of later stages. This happens when 'stage_1' hasn't finished parsing a value which
* requires copying (i.e. a 'number literal') and the digest buffer ran out of space. Under this
* circumstance, later stages don't process the 'number literal' interval, but copy its values in case
* a later 'stage_1' iteration can resume the literal parsing.
*/
char* new_end_pos;
} stage_1_st;
/**
 * @brief Holds the state used for 'stage_2' parsing.
 */
typedef struct stage_2_st {
/* @brief Previous iteration last parsing position in the result buffer, after the stage
compression has taken place. */
char* pre_it_pos;
/* @brief Compression offset computed by the last iteration of this stage. */
int c_offset;
} stage_2_st;
/**
 * @brief Holds the state used for 'stage_3' parsing. Same layout as 'stage_2_st'.
 */
typedef struct stage_3_st {
/* @brief Previous iteration last parsing position in the result buffer, after the stage
compression has taken place. */
char* pre_it_pos;
/* @brief Compression offset computed by the last iteration of this stage. */
int c_offset;
} stage_3_st;
/**
 * @brief Holds the state used for 'stage_4' parsing. Same layout as 'stage_2_st'.
 */
typedef struct stage_4_st {
/* @brief Previous iteration last parsing position in the result buffer, after the stage
compression has taken place. */
char* pre_it_pos;
/* @brief Compression offset computed by the last iteration of this stage. */
int c_offset;
} stage_4_st;
/**
 * @brief Binds the shared parsing state to a query and a result buffer.
 * @details Only the members listed below are written; every other member of 'shared_st' is left untouched.
 *
 * @param shared_st The shared state to initialize.
 * @param q The query to be processed.
 * @param q_len Length of the query.
 * @param d_max_len Maximum length of the resulting digest.
 * @param res The result buffer in which the digest is going to be written.
 */
static __attribute__((always_inline)) inline
void init_shared_st(struct shared_st* shared_st, const char* const q, int q_len, int d_max_len, char* res) {
	// the state machine of the first stage starts without any mark found
	shared_st->st = st_no_mark_found;

	// every result buffer position starts at the beginning of the buffer
	shared_st->res_init_pos =
		shared_st->res_it_init_pos =
		shared_st->res_cur_pos =
		shared_st->res_pre_pos = res;

	// query and limits
	shared_st->d_max_len = d_max_len;
	shared_st->q_len = q_len;
	shared_st->q = q;
}
/**
 * @brief Initializes the 'stage_1' state.
 * @details Only the 'first_digit' flag of the literal digit state is set here, so that the first digit
 *   found by 'process_literal_digit' is recognized as the start of a number literal. All other members
 *   are left untouched; presumably the caller zero-initializes the struct - TODO confirm.
 *
 * @param fst_stage_st The 'stage_1' state to initialize.
 */
static __attribute__((always_inline)) inline
void init_stage_1_st(struct stage_1_st* fst_stage_st) {
fst_stage_st->literal_dig_st.first_digit = 1;
}
/**
 * @brief Computes the maximum length of a digest, i.e. the query length capped by the configured limit.
 *
 * @param len Length of the query.
 * @param max_query_length Configured maximum query length.
 *
 * @return The smaller of 'len' and 'max_query_length'.
 */
static inline int get_digest_max_len(int len, int max_query_length) {
	return (len > max_query_length) ? max_query_length : len;
}
/**
 * @brief Returns the buffer in which the digest is going to be written.
 *
 * @param len Required capacity, not counting the terminating NUL.
 * @param buf Optional caller supplied buffer.
 *
 * @return 'buf' when it isn't NULL; otherwise a newly allocated, zero-filled buffer of 'len + 1' chars.
 *   The allocation isn't checked here, so the result can be NULL if 'calloc' fails.
 */
static inline char* get_result_buffer(int len, char* buf) {
	if (buf != NULL) {
		return buf;
	}

	return (char *) calloc(len + SIZECHAR, sizeof(char));
}
/**
 * @brief Return the next st to be processed. State filtering based on end of query being reached is also
 *   performed here.
 * @details The checks are performed in a fixed order and the first matching one wins, even if the branch
 *   ends up not selecting a state. The order is: block comment, line comment, dollar-quoted string,
 *   literal string, quoted identifier, number, NULL, typecast, ARRAY literal, boolean and, finally,
 *   prefixed strings (U&'..', E'..', B'..', X'..').
 *
 *   Compared with previous versions:
 *   - The typecast check no longer reads the char after '::' when the query ends with '::'. It used to
 *     dereference 'q + 2' while only 'q + 1' was known to be inside the query.
 *   - Chars are converted to 'unsigned char' before being handed to 'tolower', as required by <ctype.h>.
 *
 * @param opts Digest options; only 'replace_null' is used here.
 * @param shared_st The shared processing state used to decide which is the next 'processing state'.
 *
 * @return The next processing state, 'st_no_mark_found' if no special construct starts at the current char.
 */
static __attribute__((always_inline)) inline
enum p_st get_next_st(const options* opts, struct shared_st* shared_st) {
	char prev_char = shared_st->prev_char;
	enum p_st st = st_no_mark_found;

	// cmnt type 1 - start with '/*'
	if(
		// v1_crashing_payload_05
		shared_st->q_cur_pos < (shared_st->q_len - 2) &&
		*shared_st->q == '/' && *(shared_st->q+1) == '*'
	) {
		st = st_cmnt_type_1;
	}
	// cmnt type 2 - --...
	else if (*shared_st->q == '-' && shared_st->q_cur_pos < (shared_st->q_len - 1) &&
		*(shared_st->q + 1) == '-')
	{
		// PG: -- starts comment regardless of following space
		if (prev_char != '-') { st = st_cmnt_type_2; }
		else if (shared_st->q_cur_pos == 0) { st = st_cmnt_type_2; }
	}
	// dollar-quoted string start (Postgres: $tag$ or $$)
	else if (*shared_st->q == '$') {
		// Check for a PostgreSQL dollar-quoted string.
		// Format: $tag$ ... $tag$
		//
		// The tag may be empty or consist only of letters, digits, or underscores.
		// Example valid tags: $$, $foo$, $TAG_123$
		//
		// Here we scan characters after the first '$' to verify that:
		//   1. All tag characters are [A-Za-z0-9_], and
		//   2. The tag is terminated by another '$'
		//
		// If so, we treat it as the start of a dollar-quoted string literal.
		const char* p = shared_st->q + 1;
		while (p < shared_st->q + (shared_st->q_len - shared_st->q_cur_pos) &&
			((*p >= 'A' && *p <= 'Z') || (*p >= 'a' && *p <= 'z') || (*p >= '0' && *p <= '9') || *p == '_')) {
			p++;
		}
		if (p < shared_st->q + (shared_st->q_len - shared_st->q_cur_pos) && *p == '$') {
			st = st_dollar_quote_string;
		}
	}
	// string - single-quote is string in both
	else if (is_token_char(shared_st->prev_char) && *shared_st->q == '\'') {
		st = st_literal_string;
	}
	// double-quoted identifier
	else if (is_token_char(shared_st->prev_char) && *shared_st->q == '"') {
		st = st_quoted_identifier;
	}
	// may be digit - start with digit
	else if (is_token_char(prev_char) && is_digit_char(*shared_st->q)) {
		st = st_literal_number;
	}
	// NULL processing
	else if (
		is_token_char(shared_st->prev_char) &&
		(*shared_st->q == 'n' || *shared_st->q == 'N') && opts->replace_null
	) {
		st = st_replace_null;
	}
	// PostgreSQL type cast ::typename
	else if (prev_char != '"' &&
		shared_st->q_cur_pos < shared_st->q_len - 1 &&
		*shared_st->q == ':' && *(shared_st->q + 1) == ':') {
		// Peek the first char after '::', but only if that char is still part of the query. A query
		// ending in '::' has nothing to peek at.
		if (shared_st->q_cur_pos < shared_st->q_len - 2) {
			const char* p = shared_st->q + 2;
			// Accepted starts for the type name:
			//   - any char accepted by 'is_normal_char'
			//   - double-quote
			if (is_normal_char(*p) || *p == '"') {
				st = st_pg_typecast;
			}
		}
	}
	// ARRAY literal detection
	else if (
		is_token_char(shared_st->prev_char) &&
		(tolower((unsigned char)*shared_st->q) == 'a')
	) {
		// Check for "ARRAY" keyword
		size_t remaining = shared_st->q_len - shared_st->q_cur_pos;
		// We need at least "ARRAY[" which is 6 characters
		if (remaining >= 6) {
			const char* p = shared_st->q;
			// Check for "ARRAY" (case-insensitive)
			if (tolower((unsigned char)p[0]) == 'a' && tolower((unsigned char)p[1]) == 'r' &&
				tolower((unsigned char)p[2]) == 'r' && tolower((unsigned char)p[3]) == 'a' &&
				tolower((unsigned char)p[4]) == 'y') {
				// Now check if next character after "ARRAY" is '[' or space followed by '['
				size_t pos = 5; // After "ARRAY"
				// Skip any whitespace
				while (pos < remaining && is_space_char(p[pos])) {
					pos++;
				}
				// Check if we have '[' at this position
				if (pos < remaining && p[pos] == '[') {
					st = st_array_literal;
				}
			}
		}
	}
	// Boolean literal detection
	else if (
		is_token_char(shared_st->prev_char) &&
		(tolower((unsigned char)*shared_st->q) == 't' || tolower((unsigned char)*shared_st->q) == 'f')
	) {
		st = st_replace_boolean;
	} else if (is_token_char(shared_st->prev_char)) {
		const char* q = shared_st->q;
		size_t remaining = shared_st->q_len - shared_st->q_cur_pos;
		// U&' prefix
		if (remaining >= 3 && (q[0] == 'U' || q[0] == 'u') &&
			q[1] == '&' && q[2] == '\'') {
			st = st_literal_prefix_type;
		} else if (remaining >= 2 && q[1] == '\'') { // Single-char prefixes
			char lower_prefix = (char)tolower((unsigned char)q[0]);
			if (lower_prefix == 'e' ||
				lower_prefix == 'b' ||
				lower_prefix == 'x') {
				st = st_literal_prefix_type;
			}
		}
	}

	return st;
}
/**
 * @brief Advances the query reading position by one char, updating 'prev_char' accordingly.
 * @details If 'keep_prev_char' is set, the currently imposed 'prev_char' is preserved and the flag is
 *   cleared; otherwise 'prev_char' becomes the char being left behind.
 *
 * @param shared_st The shared state to modify.
 */
static __attribute__((always_inline)) inline
void inc_proc_pos(shared_st* shared_st) {
	if (shared_st->keep_prev_char) {
		// one-shot flag: honor it and clear it
		shared_st->keep_prev_char = false;
	} else {
		shared_st->prev_char = *shared_st->q;
	}

	shared_st->q_cur_pos++;
	shared_st->q++;
}
/**
 * @brief Copy the next character into the result buffer and increment the current processing position.
 * @details Any space char (see 'is_space_char') is copied as ' '. Other chars are lowercased first when
 *   'opts->lowercase' is enabled.
 *
 * @param shared_st The shared state to modify.
 * @param opts Options that determine how the next character is going to be copied.
 */
static __attribute__((always_inline)) inline
void copy_next_char(shared_st* shared_st, const options* opts) {
	char out = *shared_st->q;

	if (is_space_char(out)) {
		out = ' ';
	} else if (opts->lowercase != 0) {
		out = tolower(out);
	}

	*shared_st->res_cur_pos++ = out;
	inc_proc_pos(shared_st);
}
// Per-thread scratch buffer in which 'process_cmnt_type_1' accumulates the text of the comment being
// processed when 'keep_comment' is enabled. The amount of valid data is tracked by
// 'cmnt_type_1_st::cur_cmd_cmnt_len'.
static thread_local char cur_cmd_cmnt[FIRST_COMMENT_MAX_LENGTH];
/**
 * @brief Safer version of 'is_digit_string' performing boundary checks.
 * @details The char before the current one is only inspected when the current position lies after the
 *   start of the result buffer. Accepted shapes: 0x, 0X, n.m, nE+m, nE-m, Em. When 'f == t' the single
 *   char pointed to by 'f' is tested for being a digit.
 *
 * @param shared_st The shared state used for the boundary checks.
 * @param f Initial position of the string being checked.
 * @param t Final position (exclusive) of the string being checked.
 *
 * @return '1' if the supplied string is recognized as a 'digit_string', '0' otherwise.
 */
static char is_digit_string_2(shared_st* shared_st, char *f, char *t)
{
	if (f == t) {
		return is_digit_char(*f) ? 1 : 0;
	}

	int hex_mode = 0;

	for (int idx = 0; f != t; ++f, ++idx) {
		const int has_prev = f > shared_st->res_init_pos;
		const char cur = *f;

		// second char of a '0x' / '0X' prefix: switch to hex validation
		if (has_prev && idx == 1 && *(f-1) == '0' && (cur == 'x' || cur == 'X')) {
			hex_mode = 1;
			continue;
		}

		if (hex_mode) {
			if (!is_hex_char(cur)) {
				return 0;
			}
			continue;
		}

		if (is_digit_char(cur)) {
			continue;
		}

		// non-digit chars are only accepted as part of a float: '.', 'e'/'E' or a sign right after 'e'/'E'
		int float_like = (cur == '.' || tolower(cur) == 'e');
		if (!float_like && has_prev) {
			float_like = (tolower(*(f-1)) == 'e' && (cur == '+' || cur == '-'));
		}
		if (!float_like) {
			return 0;
		}
	}

	return 1;
}
/**
 * @brief Process a detected block comment (slash-star ... star-slash), including nested ones. Determines
 *   when to exit the 'st_cmnt_type_1' state.
 * @details The function is called once per char while in 'st_cmnt_type_1'. On the first call it assumes
 *   that 'shared_st->q' is pointing to the initial '/' of the comment start. Nothing is copied to the result
 *   buffer while the comment is being parsed; when 'opts->keep_comment' is enabled the comment text is
 *   accumulated in 'cur_cmd_cmnt' and copied to the result buffer **after** the final closing delimiter has
 *   been found. The text of the first comment found in the query is additionally stored in '*fst_cmnt'.
 *
 *   Compared with previous versions, the opening delimiter is only stored in 'cur_cmd_cmnt' if there is
 *   room for it. Every nested opener used to append two chars unconditionally, which allowed a query with
 *   a long or deeply nested comment to write past the end of the buffer.
 *
 * @param opts Digest options; only 'keep_comment' is used here.
 * @param shared_st Shared state used to continue the query processing.
 * @param c_t_1_st The 'comment_type_1' parsing state, holds the found information about the comment being
 *   parsed.
 * @param fst_cmnt In/out pointer to the 'first comment' buffer. If '*fst_cmnt' is NULL a buffer of
 *   'FIRST_COMMENT_MAX_LENGTH' bytes is allocated with 'malloc' (the allocation isn't checked); the
 *   function never frees it.
 *
 * @return The next processing state, it could be either:
 *   - 'st_cmnt_type_1' if the comment hasn't yet completed to be parsed.
 *   - 'st_no_mark_found' if the comment has completed to be parsed, or the end of the query was reached.
 */
static __attribute__((always_inline)) inline
enum p_st process_cmnt_type_1(const options* opts, shared_st* shared_st, cmnt_type_1_st* c_t_1_st, char** fst_cmnt) {
	enum p_st next_st = st_cmnt_type_1;
	const char* res_final_pos = shared_st->res_init_pos + shared_st->d_max_len;

	// initial mark "/*" detection
	// comments are not copied while processed, boundary checks should rely on 'q_cur_pos' and 'q_len'.
	if (shared_st->q_cur_pos <= (shared_st->q_len-2) && *shared_st->q == '/' && *(shared_st->q+1) == '*') {
		if (c_t_1_st->nest_level == 0)
			c_t_1_st->cur_cmd_cmnt_len = 0;

		// Increment nesting level
		c_t_1_st->nest_level++;

		// copy the initial mark "/*" if comment preserving is enabled and both chars fit in the buffer,
		// leaving room for the final terminator
		if (opts->keep_comment && c_t_1_st->cur_cmd_cmnt_len < FIRST_COMMENT_MAX_LENGTH - 2) {
			cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = *(shared_st->q);
			c_t_1_st->cur_cmd_cmnt_len++;
			cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = *(shared_st->q + 1);
			c_t_1_st->cur_cmd_cmnt_len++;
		}

		// nested openers are also part of the first comment
		if (c_t_1_st->fst_cmnt_end == 0 && c_t_1_st->nest_level > 1 &&
			c_t_1_st->fst_cmnt_len < FIRST_COMMENT_MAX_LENGTH - 2) {
			assert(*fst_cmnt);
			char* next_fst_cmnt_char = *fst_cmnt + c_t_1_st->fst_cmnt_len;
			*next_fst_cmnt_char = *(shared_st->q);
			next_fst_cmnt_char++;
			*next_fst_cmnt_char = *(shared_st->q + 1);
			next_fst_cmnt_char++;
			c_t_1_st->fst_cmnt_len += 2;
		}

		// discard processed "/*"
		shared_st->q += 2;
		shared_st->q_cur_pos += 2;

		// v1_crashing_payload_04
		if (shared_st->q_cur_pos >= shared_st->q_len - 1) {
			if (c_t_1_st->fst_cmnt_end == 0 && *fst_cmnt != NULL) {
				// ensure there is a terminator at logical end
				char* c_end = *fst_cmnt + c_t_1_st->fst_cmnt_len;
				*c_end = 0;
				c_t_1_st->fst_cmnt_end = 1;
			}
			if (opts->keep_comment) {
				cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = 0;
			}
			c_t_1_st->nest_level = 0;
			return st_no_mark_found;
		}
	}

	// TODO: Check if there is exclusion between this regular first comments and first comment that are 'cmd'
	// comments by spec. To further clarify, should comments '/*!' be not considered first comments to be copied
	// into the supplied 'fst_cmnt' memory? Or should they be considered for further processing?

	// we are parsing a "/*" comment
	if (opts->keep_comment) {
		// copy the char into 'cur_cmd_cmnt'
		if (c_t_1_st->cur_cmd_cmnt_len < FIRST_COMMENT_MAX_LENGTH-1) {
			cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = *shared_st->q;
			c_t_1_st->cur_cmd_cmnt_len++;
		}
	}

	// first comment hasn't finished, we are yet copying it
	if (c_t_1_st->fst_cmnt_end == 0 &&
		c_t_1_st->fst_cmnt_len < FIRST_COMMENT_MAX_LENGTH - 1) {
		if (*fst_cmnt == NULL) {
			// initialize the 'first_comment' and set a final NULL terminator for safety
			*fst_cmnt = (char*)malloc(FIRST_COMMENT_MAX_LENGTH);
			*(*fst_cmnt + FIRST_COMMENT_MAX_LENGTH - 1) = 0;
		}
		char* next_fst_cmnt_char = *fst_cmnt + c_t_1_st->fst_cmnt_len;
		*next_fst_cmnt_char = !is_space_char(*shared_st->q) ? *shared_st->q : ' ';
		c_t_1_st->fst_cmnt_len++;
	}

	// closing mark detection
	if (shared_st->prev_char == '*' && *shared_st->q == '/') {
		// Decrement nesting level when we encounter a closing mark
		if (c_t_1_st->nest_level > 0)
			c_t_1_st->nest_level--;

		// Only end the comment when we're back at nest level 0
		if (c_t_1_st->nest_level == 0) {
			if (opts->keep_comment) {
				cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = 0;
				// copy as much of the preserved comment as fits in the result buffer
				int res_free_space = res_final_pos - shared_st->res_cur_pos;
				int comment_size = c_t_1_st->cur_cmd_cmnt_len;
				int copy_length = res_free_space > comment_size ? comment_size : res_free_space;
				memcpy(shared_st->res_cur_pos, cur_cmd_cmnt, copy_length);
				shared_st->res_cur_pos += copy_length;
				if (*(shared_st->res_cur_pos - 1) != ' ' && shared_st->res_cur_pos != res_final_pos) {
					*shared_st->res_cur_pos++ = ' ';
				}
				// Re-initialize the comment state
				c_t_1_st->cur_cmd_cmnt_len = 0;
			}

			if (shared_st->res_init_pos != shared_st->res_cur_pos && shared_st->res_cur_pos != res_final_pos &&
				// if the prev copied char isn't a space comment wasn't space separated in the query:
				// ```
				// Q: `SELECT/*FOO*/1`
				//           ^ no space char
				// ```
				// thus we impose an extra space in replace for the omitted comment
				*(shared_st->res_cur_pos - 1) != ' '
			) {
				*shared_st->res_cur_pos++ = ' ';
			}

			// back to main shared_st->query parsing state
			shared_st->prev_char = ' ';
			next_st = st_no_mark_found;

			// Finalize first comment if we were tracking it
			if (c_t_1_st->fst_cmnt_end == 0 && *fst_cmnt != NULL) {
				// drop the two chars of the closing mark from the stored first comment
				if (c_t_1_st->fst_cmnt_len >= 2) {
					c_t_1_st->fst_cmnt_len -= 2;
				}
				char* c_end = *fst_cmnt + c_t_1_st->fst_cmnt_len;
				*c_end = 0;
				c_t_1_st->fst_cmnt_end = 1;
			}

			shared_st->q_cur_pos += 1;
			shared_st->q++;
		} else {
			// Still in nested comment - don't exit comment state yet
			next_st = st_cmnt_type_1;
		}
	}

	// Check if we've reached the end of query
	if (shared_st->q_cur_pos >= shared_st->q_len - 1) {
		// Finalize first comment if we were tracking it
		if (c_t_1_st->fst_cmnt_end == 0 && *fst_cmnt != NULL) {
			// ensure there is a terminator at logical end
			char* c_end = *fst_cmnt + c_t_1_st->fst_cmnt_len;
			*c_end = 0;
			c_t_1_st->fst_cmnt_end = 1;
		}
		if (opts->keep_comment) {
			cur_cmd_cmnt[c_t_1_st->cur_cmd_cmnt_len] = 0;
		}
		// reset nesting so parser isn't left in the middle of a comment
		c_t_1_st->nest_level = 0;
		return st_no_mark_found;
	}

	return next_st;
}
/**
 * @brief Handles the processing state 'st_cmnt_type_2', i.e. '--' line comments.
 * @details State 'st_cmnt_type_2' doesn't copy any data to the result buffer. The leading '--' is skipped
 *   when found (only if at least one more char follows it), and the state is left when a line feed, a
 *   carriage return, or the last char of the query is reached. When leaving, the current char is consumed
 *   and 'prev_char' is set to ' '.
 *
 * @param shared_st Shared state used to continue the query processing.
 *
 * @return The next processing state, it could be either:
 *   - 'st_cmnt_type_2' if the comment hasn't yet completed to be parsed.
 *   - 'st_no_mark_found' if the comment has completed to be parsed.
 */
static __attribute__((always_inline)) inline
enum p_st process_cmnt_type_2(shared_st* shared_st) {
	const int at_marker =
		shared_st->q_cur_pos <= (shared_st->q_len - 3) &&
		*shared_st->q == '-' && *(shared_st->q+1)=='-';

	// discard processed "--"
	if (at_marker) {
		shared_st->q_cur_pos += 2;
		shared_st->q += 2;
	}

	const char cur = *shared_st->q;
	const int at_last_char = shared_st->q_cur_pos >= shared_st->q_len - 1;

	if (cur != '\n' && cur != '\r' && !at_last_char) {
		return st_cmnt_type_2;
	}

	// end of comment: consume the delimiter (or the last char) and act as if a space was found
	shared_st->prev_char = ' ';
	shared_st->q_cur_pos++;
	shared_st->q++;

	return st_no_mark_found;
}
/**
 * @brief Consumes an optional "UESCAPE '<char>'" clause following a U&'...' string literal.
 * @details Starting at the current query position, this skips whitespace, the case-insensitive keyword
 *   'UESCAPE', more whitespace, an optional 'e'/'E' prefix and a single-quoted literal (a backslash
 *   skips the char that follows it). The query position is only advanced when the whole prefix up to the
 *   opening quote is found; otherwise the shared state is left untouched. If the closing quote is
 *   missing, everything up to the end of the query is consumed. 'prev_char' is never modified.
 *
 * @param s Shared state whose query position is advanced past the clause.
 */
static inline void try_consume_uescape(shared_st* s) {
	const char keyword[] = "uescape";
	const size_t keyword_len = sizeof(keyword) - 1;
	const size_t available = s->q_len - s->q_cur_pos;

	const char* cur = s->q;
	size_t left = available;

	/* leading whitespace */
	for (; left != 0 && is_space_char(*cur); cur++, left--) {}

	/* keyword, case-insensitive */
	if (left < keyword_len) {
		return;
	}
	for (size_t k = 0; k < keyword_len; k++) {
		if (tolower(cur[k]) != keyword[k]) {
			return;
		}
	}
	cur += keyword_len;
	left -= keyword_len;

	for (; left != 0 && is_space_char(*cur); cur++, left--) {}

	/* optional E prefix */
	if (left != 0 && (*cur == 'e' || *cur == 'E')) {
		cur++;
		left--;
	}

	/* the escape literal must start here */
	if (left == 0 || *cur != '\'') {
		return;
	}
	cur++;
	left--;

	/* body of the literal, up to and including the closing quote */
	while (left != 0) {
		if (*cur == '\\' && left > 1) {
			cur += 2;
			left -= 2;
			continue;
		}
		const char c = *cur;
		cur++;
		left--;
		if (c == '\'') {
			break;
		}
	}

	/* commit */
	const size_t consumed = available - left;
	s->q += consumed;
	s->q_cur_pos += consumed;
}
/**
 * @brief Handles the processing state 'st_literal_string'.
 * @details State 'st_literal_string' doesn't copy any data to the result buffer, instead, it just skips the
 *   current char until the end delimiter is found. Then rewinds the result buffer to 'res_pre_pos' and
 *   places the mark '?' there.
 *
 *   The function is called once per char and never advances the query position on its own, except when the
 *   closing delimiter is found before the end of the query. In that case the delimiter is consumed and, for
 *   U&'...' literals, a trailing UESCAPE clause is consumed as well (see 'try_consume_uescape').
 *
 *   If the closing delimiter is the last char of the query, the '?' is placed, 'copy_next_char' is cleared
 *   and the state is NOT left nor reset.
 *
 * TODO: This function currently doesn't take into account if 'NO_BACKSLASH_ESCAPES' sql_mode is being used.
 * This can lead to 'stats_pgsql_query_digest' pollution because a valid query could be received with strings
 * ending in '\''. With current implementation this final '\'' will be collapsed, leading to a string not
 * properly finding the target string delimiter. To solve this scenario the following approach could be taken:
 *   - Add an additional parameter to 'Query_Info::begin' that propagates 'no_backslash_escapes' from
 *     'PgSQL_Session::client_myds::myconn::options' through 'Query_Info::query_parser_init'.
 *   - Add a new parameter to 'query_parser_init' (or reuse currently unused 'flags' for this purpose).
 *   - Add a new parameter to 'pgsql_query_digest_and_first_comment_2' to propagate this flags.
 *   - Add a new field into the 'options' struct defined in this file for holding such flags.
 *   - Pass 'options' already received by 'stage_1_parsing' into this function and make use of it for deciding
 *     whether to ignore the processing of chars within the string when are preceded by '\'.
 * This is just a proposal and a future implementation may be subject to change.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param str_st The literal string parsing state, holds the information so far found about the state.
 *
 * @return The next processing state, it could be either:
 *   - 'st_literal_string' if the string literal hasn't yet completed to be parsed.
 *   - 'st_no_mark_found' if the string literal has completed to be parsed.
 */
static __attribute__((always_inline)) inline
enum p_st process_literal_string(shared_st* shared_st, literal_string_st* str_st) {
enum p_st next_state = st_literal_string;
// read the flag up front; it's cleared below once the UESCAPE clause has been handled
bool is_unicode = str_st->is_unicode;
// process the first delimiter
if (str_st->delim_num == 0) {
// store found delimiter
str_st->q_start_pos = shared_st->q;
str_st->delim_char = *shared_st->q;
str_st->delim_num = 1;
// NOTE: Don't increment the position in query buffer, as explained in 'stage_1_parsing'.
return next_state;
}
// need to be ignored case: escape pairs. Only checked past the first char after the opening delimiter,
// so 'prev_char' can't be the opening delimiter itself.
if(shared_st->q > str_st->q_start_pos + SIZECHAR)
{
if(
(shared_st->prev_char == '\\' && *shared_st->q == '\\') || // to process '\\\\', '\\'
(shared_st->prev_char == '\\' && *shared_st->q == str_st->delim_char) || // to process '\''
(shared_st->prev_char == str_st->delim_char && *shared_st->q == str_st->delim_char) // to process ''''
)
{
// neutralize the pair: impose a harmless 'prev_char' that 'inc_proc_pos' won't overwrite, so the
// second char of the pair can't be taken as the start of another pair
shared_st->keep_prev_char = true;
shared_st->prev_char = 'X';
// NOTE: Don't increment the position in query buffer. See 'stage_1_parsing' doc.
return next_state;
}
}
// satisfied closing string - swap string to ?
// the delimiter closes the string if it's the last char of the query, or if it isn't followed by another
// delimiter (a doubled delimiter is an escaped one)
if(
*shared_st->q == str_st->delim_char &&
(shared_st->q_len == shared_st->q_cur_pos+1 || *(shared_st->q + SIZECHAR) != str_st->delim_char)
) {
// NOTE: may not be necessary since we don't increment 'res_cur_pos' during this state. Since all the
// characters are ignored.
shared_st->res_cur_pos = shared_st->res_pre_pos;
// place the replacement mark
*shared_st->res_cur_pos++ = '?';
shared_st->prev_char = '?';
// don't copy this char if last
if (shared_st->q_len == shared_st->q_cur_pos + 1) {
shared_st->copy_next_char = 0;
// keep the same state, no token was found
return next_state;
}
// reinit the string literal state
str_st->delim_char = 0;
str_st->delim_num = 0;
str_st->q_start_pos = 0;
// update the shared state
// NOTE(review): 'delim_char' has just been reset, so this sets 'prev_char' to 0, overriding the '?'
// assigned above. Confirm whether the delimiter itself was the intended value.
shared_st->prev_char = str_st->delim_char;
// consume the closing delimiter
if(shared_st->q_cur_pos < shared_st->q_len) {
shared_st->q++;
}
shared_st->q_cur_pos++;
// exit the literal parsing state
next_state = st_no_mark_found;
// U&'...' literals may be followed by a UESCAPE clause that belongs to the literal
if (is_unicode) {
try_consume_uescape(shared_st);
str_st->is_unicode = 0;
}
}
return next_state;
}
/**
 * @brief Handles the processing state 'st_dollar_quote_string'.
 * @details The function is called once per char while in the state:
 *   - On the first call ('dq_st->tag_start' is NULL) the opening delimiter, '$$' or '$tag$', is validated,
 *     recorded in 'dq_st' and skipped.
 *   - On every call the current position is compared against the closing delimiter. When found, the result
 *     buffer is rewound to 'res_pre_pos', a single '?' is written, the delimiter is skipped and the state
 *     is left.
 *   - If the query has fewer chars left than the closing delimiter needs, the string is unterminated. The
 *     state is left without writing anything to the result buffer.
 *
 *   Compared with previous versions, 'dq_st' is now reset on EVERY exit from the state. The unterminated
 *   exit used to leave the old tag in place, so a later '$$' / '$tag$' opener in the same query skipped
 *   the opening delimiter parsing and was matched against the stale tag.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param dq_st The dollar-quoted string parsing state, holds the information so far found about the state.
 *
 * @return The next processing state, it could be either:
 *   - 'st_dollar_quote_string' if the dollar-quoted string hasn't yet completed to be parsed.
 *   - 'st_no_mark_found' if the dollar-quoted string has completed to be parsed, isn't valid, or is
 *     unterminated.
 */
static __attribute__((always_inline)) inline
enum p_st process_dollar_quote_string(shared_st* shared_st, dollar_quote_string_st* dq_st)
{
	enum p_st next_state = st_dollar_quote_string;
	// Number of bytes remaining in the input buffer
	size_t remaining = shared_st->q_len - shared_st->q_cur_pos;

	// ============================================================
	// PHASE 1 — Detect and initialize the opening $tag$
	// ============================================================
	if (dq_st->tag_start == NULL) {
		// At least "$$" is needed to form a valid opening delimiter
		if (remaining < 2) {
			return st_no_mark_found;
		}

		// Start scanning after the first '$' to read the tag
		const char* p = shared_st->q + 1; // skip first $

		// Read tag characters until another '$' or buffer end
		// Valid characters: [A-Za-z0-9_]
		while ((size_t)(p - shared_st->q) < remaining && *p != '$') {
			char c = *p;
			if (!((c >= 'a' && c <= 'z') ||
				(c >= 'A' && c <= 'Z') ||
				(c >= '0' && c <= '9') ||
				c == '_'))
			{
				// Illegal tag character -> this is not a dollar-quote
				return st_no_mark_found;
			}
			p++;
		}

		// If we reached end-of-buffer or didn't find a closing '$', it's not valid
		if ((size_t)(p - shared_st->q) >= remaining || *p != '$') {
			return st_no_mark_found;
		}

		// Tag metadata. Example: $TAG$ -> tag starts at 'T', length is 3 (0 for $$)
		const char* tag_start = shared_st->q + 1;
		const size_t tag_len = (size_t)(p - tag_start);

		// Check that skipping "$tag$" will not exceed buffer bounds; nothing is stored on failure
		if (shared_st->q_cur_pos + tag_len + 2 > (size_t)shared_st->q_len) {
			return st_no_mark_found;
		}

		dq_st->tag_start = tag_start;
		dq_st->tag_len = tag_len;

		// Advance input pointers past the opening delimiter
		shared_st->q += dq_st->tag_len + 2;
		shared_st->q_cur_pos += dq_st->tag_len + 2;
	}

	// ============================================================
	// PHASE 2 — Inside the dollar-quoted string
	//           Look for the closing delimiter $tag$
	// ============================================================
	remaining = shared_st->q_len - shared_st->q_cur_pos;

	// Check if enough bytes remain to match the closing delimiter
	if (remaining < (size_t)(dq_st->tag_len + 2)) {
		// Unterminated string: leave the state, dropping the tag so that it can't be matched against a
		// later opening delimiter
		dq_st->tag_start = NULL;
		dq_st->tag_len = 0;
		return st_no_mark_found;
	}

	// Validate: '$' + tag + '$'
	if (*shared_st->q == '$' &&
		memcmp(shared_st->q + 1, dq_st->tag_start, dq_st->tag_len) == 0 &&
		*(shared_st->q + 1 + dq_st->tag_len) == '$')
	{
		// Found the closing delimiter
		// Replace the entire dollar-quoted string with a single '?'
		shared_st->res_cur_pos = shared_st->res_pre_pos;
		*shared_st->res_cur_pos++ = '?';

		// Skip past the closing delimiter
		shared_st->q += dq_st->tag_len + 2;
		shared_st->q_cur_pos += dq_st->tag_len + 2;

		// Reset stored tag so the next string can be detected
		dq_st->tag_start = NULL;
		dq_st->tag_len = 0;

		return st_no_mark_found;
	}

	// No closing delimiter at this position: still inside the string
	return next_state;
}
/**
 * @brief Handles the processing state 'st_literal_number'.
 * @details On the first digit the start of the candidate number inside the result buffer is recorded in
 *  'digit_st->start_pos'. Once a terminating token char (or the last input char) is reached, the span
 *  '[start_pos, res_cur_pos)' is checked with 'is_digit_string_2' and, when it is a number, collapsed into
 *  a single '?'. A '.' and a sign following an exponent marker ('e'/'E') are not treated as terminators.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param digit_st The literal digit parsing state, holds the information so far found about the state.
 * @param opts TODO: Currently unused, remove.
 *
 * @return The next processing state, it could be either:
 *   - 'st_literal_number' if the literal number hasn't yet completed to be parsed.
 *   - 'st_no_mark_found' if the literal number has completed to be parsed.
 */
static __attribute__((always_inline)) inline
enum p_st process_literal_digit(shared_st* shared_st, literal_digit_st* digit_st, const options* opts) {
    enum p_st next_state = st_literal_number;

    // process the first digit
    if (digit_st->first_digit == 1 && is_token_char(shared_st->prev_char) && is_digit_char(*shared_st->q)) {
        // store the start position of digit literal in the result buffer for later iterations
        digit_st->start_pos = shared_st->res_pre_pos;
        // store the first digit
        *shared_st->res_cur_pos = *shared_st->q;
        digit_st->first_digit = 0;
        // NOTE: Don't increment the position in query buffer, as explained in 'stage_1_parsing'.
    }

    // '.' and an exponent sign ("1e-5", "1E+5") belong to the number and must not end it.
    // The cast is required: passing a negative 'char' (e.g. a UTF-8 byte) to 'tolower' is undefined.
    char is_float_char = *shared_st->q == '.' ||
        ( tolower((unsigned char)shared_st->prev_char) == 'e' && ( *shared_st->q == '-' || *shared_st->q == '+' ) );

    // token char or last char
    if ((is_token_char(*shared_st->q) && is_float_char == 0) || shared_st->q_len == shared_st->q_cur_pos + 1) {
        if (is_digit_string_2(shared_st, digit_st->start_pos, shared_st->res_cur_pos)) {
            shared_st->res_cur_pos = digit_st->start_pos;
            // place the replacement mark
            *shared_st->res_cur_pos++ = '?';
            shared_st->prev_char = '?';
            // don't copy this char if last and is not token
            if (is_token_char(*shared_st->q) == 0 && shared_st->q_len == shared_st->q_cur_pos + 1) {
                shared_st->copy_next_char = 0;
                // keep the same state, no token was found
                return next_state;
            }
        }
        digit_st->start_pos = NULL;
        digit_st->first_digit = 0;
        next_state = st_no_mark_found;
    }

    return next_state;
}
/**
 * @brief Alternative impl for 'NULL' replacement, unused right now. TODO: Remove.
 * @details Matches the word "null" (case-insensitive) one char per call, tracking the number of matched
 *  chars in 'null_st->null_pos'. When the four chars have been matched and either the input ends or a token
 *  char follows, everything written since 'res_pre_pos' is replaced by a single '?'.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param null_st The 'NULL' matching state, holds the number of chars matched so far.
 *
 * @return 'st_replace_null' while the match is still in progress, 'st_no_mark_found' otherwise.
 */
static __attribute__((always_inline)) inline
enum p_st process_replace_null_single_chars(shared_st* shared_st, literal_null_st* null_st) {
    enum p_st next_st = st_replace_null;
    const char* null_str = "null";

    if (null_st->null_pos <= 3) {
        // The cast is required: passing a negative 'char' to 'tolower' is undefined behavior.
        if (tolower((unsigned char)*shared_st->q) == null_str[null_st->null_pos]) {
            null_st->null_pos++;
        } else {
            next_st = st_no_mark_found;
        }

        if (shared_st->q_cur_pos == shared_st->q_len - 1 && null_st->null_pos == 4) {
            // no need for changing the state it's the last char
            shared_st->copy_next_char = 0;
            shared_st->res_cur_pos = shared_st->res_pre_pos;
            // place the replacement mark
            *shared_st->res_cur_pos++ = '?';
            shared_st->prev_char = '?';
        }
    } else if (null_st->null_pos == 4) {
        if (is_token_char(*shared_st->q)) {
            shared_st->copy_next_char = 0;
            shared_st->res_cur_pos = shared_st->res_pre_pos;
            // place the replacement mark
            *shared_st->res_cur_pos++ = '?';
            shared_st->prev_char = '?';
            // don't copy current char, go immediately back to initial state
            next_st = st_no_mark_found;
        }
    }

    return next_st;
}
/**
 * @brief Process the 'st_replace_null' state.
 * @details This function never checks whether the 'replace_null' feature is enabled; when it is disabled
 *  this state is expected to be unreachable. The state always completes in a single call: either the
 *  literal 'NULL' (any letter case) found at the current position is replaced by '?', or the current char
 *  is copied through 'copy_next_char'. In both cases processing returns to 'st_no_mark_found'.
 *
 *  'NULL' is only recognized when it is followed by a token char or when it is the last word of the input.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param opts Options to be used for the copying of the current char.
 *
 * @return Always 'st_no_mark_found'.
 */
static __attribute__((always_inline)) inline
enum p_st process_replace_null(shared_st* shared_st, const options* opts) {
    const char* cur = shared_st->q;
    char null_found = 0;

    // At least the four letters of the word must be available
    if ((shared_st->q_len - shared_st->q_cur_pos) >= 4) {
        null_found =
            (cur[0] == 'N' || cur[0] == 'n') &&
            (cur[1] == 'U' || cur[1] == 'u') &&
            (cur[2] == 'L' || cur[2] == 'l') &&
            (cur[3] == 'L' || cur[3] == 'l');

        // When more input follows, the word must end right here
        if (null_found && (shared_st->q_len - shared_st->q_cur_pos) > 4) {
            null_found = is_token_char(cur[4]);
        }
    }

    if (null_found != 1) {
        // not a 'NULL' literal: process the first char and continue
        copy_next_char(shared_st, opts);
        return st_no_mark_found;
    }

    // place the replacement mark over whatever was written since 'res_pre_pos'
    shared_st->res_cur_pos = shared_st->res_pre_pos;
    *shared_st->res_cur_pos++ = '?';
    shared_st->prev_char = '?';

    // skip the four letters of the literal
    shared_st->q += 4;
    shared_st->q_cur_pos += 4;

    return st_no_mark_found;
}
/**
* @brief Process the 'st_pg_typecast' state.
* @details The state 'st_pg_typecast' is responsible of
* skipping the typecast name and any modifiers after the initial "::" has been detected.
*
* @param shared_st Shared state used to continue the query processing.
* @param tc The pg_typecast parsing state, holds the information so far found about the state.
*/
static __attribute__((always_inline)) inline
enum p_st process_pg_typecast(shared_st* s, pg_typecast_st* tc)
{
enum p_st next = st_pg_typecast;
// On entering state
if (!tc->started) {
tc->started = true;
// Skip the initial "::"
s->q += 2;
s->q_cur_pos += 2;
// Skip any whitespace after ::
while (s->q_cur_pos < s->q_len && is_space_char(*s->q)) {
s->q++;
s->q_cur_pos++;
}
}
if (s->q_cur_pos >= s->q_len) {
return st_no_mark_found;
}
char c = *s->q;
// Check if we're in a quoted type name (e.g., "some type")
if (c == '"') {
// Skip opening quote
s->q++;
s->q_cur_pos++;
// Find closing quote
while (s->q_cur_pos < s->q_len && *s->q != '"') {
// Handle escaped quotes
if (*s->q == '\\' && s->q_cur_pos < s->q_len - 1 && s->q[1] == '"') {
s->q += 2;
s->q_cur_pos += 2;
}
else {
s->q++;
s->q_cur_pos++;
}
}
if (s->q_cur_pos < s->q_len && *s->q == '"') {
s->q++;
s->q_cur_pos++;
}
// After quoted identifier, there might be modifiers
c = (s->q_cur_pos < s->q_len) ? *s->q : '\0';
}
// Parse type name (alphanumeric and underscores)
while (s->q_cur_pos < s->q_len &&
((c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') ||
c == '_')) {
s->q++;
s->q_cur_pos++;
c = (s->q_cur_pos < s->q_len) ? *s->q : '\0';
}
// Skip any whitespace
while (s->q_cur_pos < s->q_len && is_space_char(c)) {
s->q++;
s->q_cur_pos++;
c = (s->q_cur_pos < s->q_len) ? *s->q : '\0';
}
// Handle type modifiers (parentheses with parameters)
if (c == '(') {
int paren_depth = 1;
s->q++;
s->q_cur_pos++;
while (s->q_cur_pos < s->q_len && paren_depth > 0) {
c = *s->q;
if (c == '(') {
paren_depth++;
}
else if (c == ')') {
paren_depth--;
}
else if (c == '\'' || c == '"') {
// Skip string literals inside type modifiers
char quote_char = c;
s->q++;
s->q_cur_pos++;
while (s->q_cur_pos < s->q_len && *s->q != quote_char) {
// Handle escaped quotes
if (*s->q == '\\' && s->q_cur_pos < s->q_len - 1 && s->q[1] == quote_char) {
s->q += 2;
s->q_cur_pos += 2;
}
else {
s->q++;
s->q_cur_pos++;
}
}
if (s->q_cur_pos < s->q_len && *s->q == quote_char) {
s->q++;
s->q_cur_pos++;
}
continue;
}
s->q++;
s->q_cur_pos++;
}
c = (s->q_cur_pos < s->q_len) ? *s->q : '\0';
}
// Handle array brackets
while (c == '[' && s->q_cur_pos < s->q_len - 1 && s->q[1] == ']') {
s->q += 2;
s->q_cur_pos += 2;
c = (s->q_cur_pos < s->q_len) ? *s->q : '\0';
// Skip any whitespace
while (s->q_cur_pos < s->q_len && is_space_char(c)) {
s->q++;
s->q_cur_pos++;
c = (s->q_cur_pos < s->q_len) ? *s->q : '\0';
}
}
// End of type name? Now check if we're at a delimiter
if (s->q_cur_pos >= s->q_len ||
is_space_char(c) ||
c == ')' || c == '(' || c == ';' || c == ',' ||
c == '+' || c == '-' || c == '*' || c == '/' ||
c == '=' || c == '<' || c == '>' || c == '@' ||
c == ']' || c == '[') {
// Exit state
return st_no_mark_found;
}
return next;
}
/**
 * @brief Process the 'st_replace_boolean' state.
 * @details The state 'st_replace_boolean' always completes in a single call: either the literal 'TRUE' or
 *  'FALSE' (any letter case) found at the current position is replaced by '?', or the current char is
 *  copied through 'copy_next_char'. In both cases processing goes back to 'st_no_mark_found'.
 *
 *  The literal is only recognized when it is followed by a token char or when it ends the input. Letters
 *  are compared against their ASCII upper/lower forms, the same way 'process_replace_null' does, so the
 *  match doesn't depend on the current locale and no 'char' value is ever passed to 'tolower'.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param opts Options to be used for the copying of the current char.
 *
 * @return Always 'st_no_mark_found'.
 */
static __attribute__((always_inline)) inline
enum p_st process_replace_boolean(shared_st* shared_st, const options* opts) {
    const char* q = shared_st->q;
    size_t remaining = shared_st->q_len - shared_st->q_cur_pos;
    // Length of the boolean literal found at the current position, 0 if none
    int literal_len = 0;

    // The length is checked before any char is inspected, so no byte past the input is ever read
    if (remaining >= 4 &&
        (q[0] == 't' || q[0] == 'T') &&
        (q[1] == 'r' || q[1] == 'R') &&
        (q[2] == 'u' || q[2] == 'U') &&
        (q[3] == 'e' || q[3] == 'E') &&
        (remaining == 4 || is_token_char(q[4]))) {
        literal_len = 4;
    } else if (remaining >= 5 &&
        (q[0] == 'f' || q[0] == 'F') &&
        (q[1] == 'a' || q[1] == 'A') &&
        (q[2] == 'l' || q[2] == 'L') &&
        (q[3] == 's' || q[3] == 'S') &&
        (q[4] == 'e' || q[4] == 'E') &&
        (remaining == 5 || is_token_char(q[5]))) {
        literal_len = 5;
    }

    if (literal_len == 0) {
        // Not a boolean literal - copy the current char and continue
        copy_next_char(shared_st, opts);
        return st_no_mark_found;
    }

    // Replace with '?'
    shared_st->res_cur_pos = shared_st->res_pre_pos;
    *shared_st->res_cur_pos++ = '?';
    shared_st->prev_char = '?';

    // Skip the boolean literal
    shared_st->q += literal_len;
    shared_st->q_cur_pos += literal_len;

    return st_no_mark_found;
}
/**
 * @brief Handles the processing state 'st_array_literal'.
 * @details State 'st_array_literal' doesn't copy any data to the result buffer, it just skips the
 * current char until the end of the array literal is found. Then replaces the previous position
 * in the result buffer with the mark '?'.
 *
 * The function is called once per input position while the state is active:
 * - On the first call ('bracket_depth == 0') the five chars of the "ARRAY" keyword, the optional
 *   whitespace and the opening '[' are skipped, and 'bracket_depth' is set to 1.
 * - Single/double quoted strings and dollar-quoted strings found at the current position are skipped
 *   as a whole, so that brackets inside them aren't counted.
 * - The char at the resulting position is used to update 'bracket_depth'. The char itself is NOT
 *   consumed here unless it is the final ']'.
 *   NOTE(review): presumably the caller advances one position whenever 'st_array_literal' is
 *   returned — confirm in 'stage_1_parsing'.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param array_st The array literal parsing state, holds the information so far found about the state.
 *
 * @return The next processing state, it could be either:
 * - 'st_array_literal' if the array literal hasn't yet completed to be parsed.
 * - 'st_no_mark_found' if the array literal has completed to be parsed, if the input isn't an array
 *   literal, or if the input ends before the array is closed. In the last case 'bracket_depth' is left
 *   untouched and no '?' is written.
 */
static __attribute__((always_inline)) inline
enum p_st process_array_literal(shared_st* shared_st, array_literal_st* array_st) {
enum p_st next_state = st_array_literal;
// 1. INITIALIZATION: Check for "ARRAY" + whitespace + "["
if (array_st->bracket_depth == 0) {
/* Ensure enough input remains for "ARRAY" */
if (shared_st->q_cur_pos + 5 > shared_st->q_len)
return st_no_mark_found;
// We're at the 'A' of "ARRAY"
// Skip "ARRAY" (5 chars)
// NOTE: the keyword itself isn't compared here, only its length is skipped.
const char* p = shared_st->q + 5;
size_t remaining = shared_st->q_len - shared_st->q_cur_pos - 5;
// Skip any whitespace
while (remaining > 0 && is_space_char(*p)) {
p++;
remaining--;
}
/* Must be at '[' */
if (remaining == 0 || *p != '[')
return st_no_mark_found;
/* Move shared state to '[' */
size_t skip_count = (size_t)(p - shared_st->q);
shared_st->q += skip_count;
shared_st->q_cur_pos += skip_count;
// We're now at the '[', set bracket depth to 1
array_st->bracket_depth = 1;
/* Skip '[' safely */
if (shared_st->q_cur_pos < shared_st->q_len) {
shared_st->q++;
shared_st->q_cur_pos++;
} else {
return next_state;
}
}
// 2. PROCESSING: Find the end of the array
// Input exhausted before the array was closed: leave the state without writing anything.
if (shared_st->q_cur_pos >= shared_st->q_len)
return st_no_mark_found;
// Process the array content
char c = *shared_st->q;
// If we encounter quotes inside the array, skip over the quoted literal so ']' inside quotes is ignored.
if (c == '\'' || c == '"') {
char quote_char = c;
// consume opening quote
shared_st->q++;
shared_st->q_cur_pos++;
while (shared_st->q_cur_pos < shared_st->q_len) {
char cc = *shared_st->q;
// backslash escape: the backslash and the char following it are skipped together
if (cc == '\\' && shared_st->q_cur_pos + 1 < shared_st->q_len) {
shared_st->q += 2;
shared_st->q_cur_pos += 2;
continue;
}
// SQL doubled single-quote
if (quote_char == '\'' && cc == '\'' &&
shared_st->q_cur_pos + 1 < shared_st->q_len &&
*(shared_st->q + 1) == '\'') {
shared_st->q += 2;
shared_st->q_cur_pos += 2;
continue;
}
// closing quote
if (cc == quote_char) {
shared_st->q++;
shared_st->q_cur_pos++;
break;
}
shared_st->q++;
shared_st->q_cur_pos++;
}
// Input exhausted, either because the literal is unterminated or because its closing quote was the
// last char. NOTE: this returns 'st_no_mark_found', so the array state is abandoned here.
if (shared_st->q_cur_pos >= shared_st->q_len)
return st_no_mark_found;
} else if (c == '$') {
// Check if this is a dollar-quoted string
const char* p = shared_st->q + 1;
// 'remaining' is at least 1 here, since the current position was checked above
size_t remaining = shared_st->q_len - shared_st->q_cur_pos;
size_t tag_len = 0;
// Read the tag (can be empty or [A-Za-z0-9_])
while (tag_len < remaining - 1 && p[tag_len] != '$') {
char tag_char = p[tag_len];
if (!((tag_char >= 'a' && tag_char <= 'z') ||
(tag_char >= 'A' && tag_char <= 'Z') ||
(tag_char >= '0' && tag_char <= '9') ||
tag_char == '_')) {
// Not a valid dollar-quoted string tag character
break;
}
tag_len++;
}
// Check if we have a valid dollar-quoted string opening delimiter
if (tag_len < remaining - 1 && p[tag_len] == '$') {
// We have a valid opening delimiter: $tag$ or $$
const char* tag_start = p;
// Move past the opening delimiter
shared_st->q += tag_len + 2; // Skip $ + tag + $
shared_st->q_cur_pos += tag_len + 2;
// Now find the closing delimiter
while (shared_st->q_cur_pos < shared_st->q_len) {
// Check if we have enough characters for the closing delimiter
if (*shared_st->q == '$' &&
shared_st->q_cur_pos + tag_len + 1 < (size_t)shared_st->q_len) {
// Check if this matches our opening tag
if (memcmp(shared_st->q + 1, tag_start, tag_len) == 0 &&
*(shared_st->q + tag_len + 1) == '$') {
// Found the closing delimiter
shared_st->q += tag_len + 2;
shared_st->q_cur_pos += tag_len + 2;
tag_start = NULL;
tag_len = 0;
break;
}
}
shared_st->q++;
shared_st->q_cur_pos++;
}
// Input exhausted, either because the closing delimiter is missing or because it ended the input.
// NOTE: this returns 'st_no_mark_found', so the array state is abandoned here.
if (shared_st->q_cur_pos >= shared_st->q_len)
return st_no_mark_found;
}
// A '$' that doesn't open a dollar-quoted string is handled as a regular char below
}
// Re-read the current char: the position may have moved past a quoted literal
c = *shared_st->q;
// 3. BRACKET COUNTING
if (c == '[') {
array_st->bracket_depth++;
} else if (c == ']') {
array_st->bracket_depth--;
if (array_st->bracket_depth == 0) {
// End of array literal found
shared_st->res_cur_pos = shared_st->res_pre_pos;
// Replace the whole thing with '?'
*shared_st->res_cur_pos++ = '?';
shared_st->prev_char = '?';
/* Skip closing ']' */
if (shared_st->q_cur_pos < shared_st->q_len) {
shared_st->q++;
shared_st->q_cur_pos++;
}
return st_no_mark_found;
}
}
// Keep same state to continue in next iteration
return next_state;
}
/**
 * @brief Process the 'st_literal_prefix_type' state.
 * @details Skips the prefix that precedes the opening quote of a string literal and hands the
 *  processing over to 'process_literal_string'. Recognized prefixes:
 *   - "U&" (any letter case), which also sets 'str_st->is_unicode'.
 *   - A single 'E', 'B' or 'X' (any letter case).
 *  In both cases the prefix must be immediately followed by a single quote.
 *
 * @param s Shared state used to continue the query processing.
 * @param str_st The literal string parsing state, holds the information so far found about the state.
 *
 * @return 'st_no_mark_found' if no prefixed literal starts at the current position (nothing is consumed
 *  in that case), otherwise the state returned by 'process_literal_string'.
 */
static __attribute__((always_inline)) inline
enum p_st process_literal_prefix_type(shared_st* s, literal_string_st* str_st) {
    const char* q = s->q;
    size_t remaining = s->q_len - s->q_cur_pos;
    // Number of chars preceding the opening quote, 0 if no known prefix was found
    int prefix_len = 0;

    if (remaining >= 3 && (q[0] == 'U' || q[0] == 'u') &&
        q[1] == '&' && q[2] == '\'') {
        // U&' prefix
        str_st->is_unicode = 1;
        prefix_len = 2;
    } else if (remaining >= 2 && q[1] == '\'') {
        // Single-char prefixes.
        // The cast is required: passing a negative 'char' to 'tolower' is undefined behavior.
        char lower_prefix = (char)tolower((unsigned char)q[0]);
        if (lower_prefix == 'e' ||
            lower_prefix == 'b' ||
            lower_prefix == 'x') {
            prefix_len = 1;
        }
    }

    if (prefix_len == 0) {
        return st_no_mark_found;
    }

    // Leave the position at the opening quote and process it as a regular string literal
    s->q += prefix_len;
    s->q_cur_pos += prefix_len;

    return process_literal_string(s, str_st);
}
/**
 * @brief Handles the processing state 'st_quoted_identifier'.
 * @details This function only tracks where a double-quoted identifier starts and ends; it never writes
 *  to the result buffer nor advances the input position.
 *   - First call ('delim_num == 0'): the current position and char are recorded as the opening
 *     delimiter.
 *   - Following calls: the state is kept until a '"' is found at the current position, at which point
 *     the tracking state is cleared.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param str_st The quoted identifier parsing state, holds the information so far found about the state.
 *
 * @return The next processing state, it could be either:
 *   - 'st_quoted_identifier' if the quoted identifier hasn't yet completed to be parsed.
 *   - 'st_no_mark_found' if the closing quote has been found.
 */
static __attribute__((always_inline)) inline
enum p_st process_quoted_identifier(shared_st* shared_st, quoted_identifier_st* str_st) {
    const char cur = *shared_st->q;

    if (str_st->delim_num == 0) {
        // opening delimiter: remember where the identifier starts
        str_st->q_start_pos = shared_st->q;
        str_st->delim_char = cur; // Should be '"'
        str_st->delim_num = 1;
        return st_quoted_identifier;
    }

    if (cur != '"') {
        // still inside the identifier
        return st_quoted_identifier;
    }

    // closing quote: clear the tracking state and leave the parsing state
    str_st->delim_char = 0;
    str_st->delim_num = 0;
    str_st->q_start_pos = 0;

    return st_no_mark_found;
}
/**
 * @brief Computes the 'digest_end' position, i.e. the last position of the result buffer that the
 *  compression stage being processed is allowed to visit.
 * @details Two cases are possible:
 *   - 'stage 1' stopped while parsing a number ('st_literal_number' with a known 'start_pos'): the end
 *     is the position right before the first digit of that number.
 *   - Otherwise: the end is the position right before the current write position 'res_cur_pos'.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param stage_1_st The 'stage 1' state used to decide which will be the 'digest_end' position for the
 *  current stage.
 *
 * @return The last position to be processed by the current stage.
 */
static __attribute__((always_inline)) inline
char* get_stage_digest_end(shared_st* shared_st, stage_1_st* stage_1_st) {
    char* const number_start = stage_1_st->literal_dig_st.start_pos;

    if (shared_st->st == st_literal_number && number_start != NULL) {
        return number_start - 1;
    }

    return shared_st->res_cur_pos - 1;
}
/**
 * @brief Sets the starting position for the current stage being processed.
 * @details Both 'res_cur_pos' and 'res_pre_pos' are moved to 'next_start_pos' when all these conditions
 *  hold, otherwise they are moved to 'res_init_pos':
 *   - This isn't the initial iteration, i.e. 'res_init_pos' differs from 'res_it_init_pos'.
 *   - 'next_start_pos' lies in the range '[res_init_pos, digest_end)'.
 *
 *  Rationale: when this isn't the first iteration, a previous iteration of the same stage already
 *  compressed the buffer up to a certain position, so iterating the whole result buffer again would
 *  be pointless.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param digest_end The computed 'digest_end' for the stage being processed.
 * @param next_start_pos The candidate starting position for the stage.
 */
static __attribute__((always_inline)) inline
void set_stage_next_start_pos(shared_st* shared_st, char* digest_end, char* next_start_pos) {
    const bool first_iteration = shared_st->res_init_pos == shared_st->res_it_init_pos;
    const bool in_range = next_start_pos >= shared_st->res_init_pos && next_start_pos < digest_end;

    // default to the beginning of the result buffer
    char* start_pos = shared_st->res_init_pos;
    if (!first_iteration && in_range) {
        start_pos = next_start_pos;
    }

    shared_st->res_cur_pos = start_pos;
    shared_st->res_pre_pos = start_pos;
}
/**
 * @brief Finalizes the compression stage and updates the supplied stage iteration final position
 *  'stage_pre_it_pos'.
 * @details When the stage stopped right before a number that 'stage 1' left unfinished
 *  ('digest_end == start_pos - 1' and 'new_end_pos' is set), the chars of that number, which the stage
 *  didn't process, are moved so they follow the compressed output. 'start_pos' and 'new_end_pos' from
 *  'stage 1' are updated to the new location of the number. In any case the compressed output is
 *  null-terminated.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param digest_end The computed 'digest_end' for the stage being processed.
 * @param stage_1_st The state from the previous iteration of 'stage 1'.
 * @param stage_pre_it_pos Pointer to be updated with the final position being processed for compression by
 *  the stage.
 */
static __attribute__((always_inline)) inline
void end_compression_stage_it(shared_st* shared_st, char* digest_end, stage_1_st* stage_1_st, char** stage_pre_it_pos) {
    // Common case: nothing pending from 'stage 1', just terminate the compressed output
    if (!(digest_end == stage_1_st->literal_dig_st.start_pos - 1 && stage_1_st->new_end_pos)) {
        *shared_st->res_pre_pos = 0;
        *stage_pre_it_pos = shared_st->res_pre_pos;
        return;
    }

    // The unfinished number starts at the old 'start_pos' and ends at 'new_end_pos'
    char* src = stage_1_st->literal_dig_st.start_pos;

    // From now on the number starts where the compressed output ends
    stage_1_st->literal_dig_st.start_pos = shared_st->res_pre_pos;
    *stage_pre_it_pos = stage_1_st->literal_dig_st.start_pos;

    // Move the pending chars one by one, keeping both result positions advancing together
    for (; src < stage_1_st->new_end_pos; src++) {
        *shared_st->res_pre_pos++ = *src;
        shared_st->res_cur_pos++;
    }

    *shared_st->res_pre_pos = 0;
    stage_1_st->new_end_pos = shared_st->res_pre_pos;
}
/**
 * @brief Performs the first stage parsing. This stage replaces values and extra spaces from the query, and
 * extracts any 'first comment' found within it.
 * @details This parsing stage is responsible for replacing the following elements from the query:
 * - String literals.
 * - Number literals: Hexadecimal, scientific notation, floating point numbers, regular numbers.
 * - NULL literals, if option 'replace_nulls' is supplied.
 * - Comments of any class; the first comment found of type '/\**\/' should be retrieved via 'fst_cmnt'
 * parameter. If the comment is a 'cmd' comment, it should be copied into the query digest instead of
 * being ignored.
 * - Leading spaces and double spaces found.
 *
 * This stage is the only stage that copies the characters being processed, all the other stages
 * perform further compression on the query digest resulting from this stage's initial value replacement.
 *
 * Implementation Details:
 * 1. The stage parsing is implemented as a main loop consuming the characters present in the supplied query
 * buffer, this iteration stops when all the characters have been consumed or the result buffer is
 * exhausted.
 * 2. The detection of parsing states is performed by function 'get_next_st', whenever a new parsing state
 * is required for parsing current and subsequent chars, the transition to that state happens immediately,
 * without consuming current char.
 * 3. The state processing functions are responsible for deciding whether or not the characters processed
 * during that state are copied into the resulting buffer. For states that don't automatically switch back
 * to neutral state 'st_no_mark_found', it's *not required* to consume the first digit, since this will
 * automatically take place at the end of the current iteration.
 *
 * On return, 'shared_st->st' holds the state in which parsing stopped, 'stage_1_st->pre_it_pos' holds the
 * final write position, and 'stage_1_st->new_end_pos' is only non-NULL when parsing stopped in the middle
 * of a number ('st_literal_number').
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param stage_1_st The first stage state to be updated.
 * @param opts Options used to homogenize queries via 'lowercase' or 'replace_nulls' options.
 * @param fst_cmnt Pointer to be updated with the found first comment, left unmodified otherwise.
 */
static __attribute__((always_inline)) inline
void stage_1_parsing(shared_st* shared_st, stage_1_st* stage_1_st, const options* opts, char** fst_cmnt) {
// state required between different iterations of special parsing states
// last position of the result buffer at which the main loop is allowed to start an iteration
char* res_final_pos = shared_st->res_init_pos + shared_st->d_max_len - 1;
cmnt_type_1_st* const cmnt_type_1_st = &stage_1_st->cmnt_type_1_st;
literal_string_st* const literal_str_st = &stage_1_st->literal_str_st;
literal_digit_st* const literal_dig_st = &stage_1_st->literal_dig_st;
dollar_quote_string_st* const dollar_quote_str_st = &stage_1_st->dollar_quote_str_st;
pg_typecast_st* const pg_tc_st = &stage_1_st->pg_tc_st;
array_literal_st* const array_st = &stage_1_st->array_st;
quoted_identifier_st* const quoted_identifier_str_st = &stage_1_st->quoted_iden_st;
// starting state can belong to a previous iteration
enum p_st cur_st = shared_st->st;
// if the previous iteration was parsing a number
if (stage_1_st->new_end_pos != NULL) {
shared_st->res_cur_pos = stage_1_st->new_end_pos;
shared_st->res_pre_pos = stage_1_st->new_end_pos;
}
// NOTE: Required for 'digest_corner_cases_3.hjson'
// Space detection can fail when coming from another iteration if 'prev_char' is not reset.
// This can allow to copy the null terminator due to the logic in 'double spaces' suppression.
if (shared_st->res_init_pos != shared_st->res_it_init_pos) {
shared_st->prev_char = *(shared_st->res_pre_pos - 1);
}
// Stop when either:
// 1. There is no more room left the result buffer.
// 2. The final position of the received query has been reached.
while (shared_st->res_cur_pos <= res_final_pos && shared_st->q_cur_pos < shared_st->q_len) {
if (cur_st == st_no_mark_found) {
// update the last position over the return buffer to be the current position
shared_st->res_pre_pos = shared_st->res_cur_pos;
cur_st = get_next_st(opts, shared_st);
// if next st isn't 'no_mark_found' transition to it without consuming current char
if (cur_st != st_no_mark_found) {
continue;
} else {
// generic space removal operations
// ================================
// Removal of spaces that doesn't belong to any particular parsing state.
// ignore all the leading spaces
if (shared_st->res_cur_pos == shared_st->res_init_pos && is_space_char(*shared_st->q)) {
shared_st->q++;
shared_st->q_cur_pos++;
continue;
}
// suppress all the double spaces.
// ==============================
//
// The suppression is performed using the address of the second space found as the
// pivoting point for further space suppression in the result buffer:
//
// ```
// Q: `SELECT\s\s 1`
// ^ address used to be replaced by next char
// ```
if (is_space_char(shared_st->prev_char) && is_space_char(*shared_st->q)) {
// if current position in result buffer is the first space found, we move to the next
// position, in order to respect the first space char.
// NOTE: 'res_cur_pos - 1' is a valid position here, the leading spaces check above already
// handled the case in which 'res_cur_pos' is the start of the result buffer.
if (!is_space_char(*(shared_st->res_cur_pos-1))) {
shared_st->res_cur_pos++;
}
shared_st->prev_char = ' ';
*shared_st->res_cur_pos = ' ';
shared_st->q++;
shared_st->q_cur_pos++;
continue;
}
// copy the current char
copy_next_char(shared_st, opts);
}
} else {
// Special parsing states. Every case follows the same pattern:
// 1. 'copy_next_char' is set to the default copy policy of the state.
// 2. The state processing function is called, it returns the next state.
// 3. If the state finished ('st_no_mark_found'), copying is re-enabled and the loop restarts
// WITHOUT consuming the current char, so it's evaluated again from the neutral state.
// 4. Otherwise the current char is consumed after the 'switch', copied or skipped depending on
// 'copy_next_char'.
switch (cur_st) {
case st_cmnt_type_1:
// by default, we don't copy the next char for comments
shared_st->copy_next_char = 0;
cur_st = process_cmnt_type_1(opts, shared_st, cmnt_type_1_st, fst_cmnt);
if (cur_st == st_no_mark_found) {
shared_st->copy_next_char = 1;
continue;
}
break;
case st_cmnt_type_2:
shared_st->copy_next_char = 0;
cur_st = process_cmnt_type_2(shared_st);
if (cur_st == st_no_mark_found) {
shared_st->copy_next_char = 1;
continue;
}
break;
case st_literal_string:
// NOTE: Not required to copy since spaces are not going to be processed here
shared_st->copy_next_char = 0;
cur_st = process_literal_string(shared_st, literal_str_st);
if (cur_st == st_no_mark_found) {
shared_st->copy_next_char = 1;
continue;
}
break;
case st_quoted_identifier:
shared_st->copy_next_char = 1; // We copy characters in this state
cur_st = process_quoted_identifier(shared_st, quoted_identifier_str_st);
if (cur_st == st_no_mark_found) {
shared_st->copy_next_char = 1;
continue;
}
break;
case st_dollar_quote_string:
shared_st->copy_next_char = 0;
cur_st = process_dollar_quote_string(shared_st, dollar_quote_str_st);
if (cur_st == st_no_mark_found) {
shared_st->copy_next_char = 1;
continue;
}
break;
case st_literal_number:
// digits are copied while parsed, they are replaced afterwards if they form a number
shared_st->copy_next_char = 1;
cur_st = process_literal_digit(shared_st, literal_dig_st, opts);
if (cur_st == st_no_mark_found) {
// re-arm the first digit detection for the next number
literal_dig_st->first_digit = 1;
shared_st->copy_next_char = 1;
continue;
}
break;
case st_replace_null:
// shared_st->copy_next_char = 1;
cur_st = process_replace_null(shared_st, opts);
if (cur_st == st_no_mark_found) {
// literal_null_st.null_pos = 0;
shared_st->copy_next_char = 1;
continue;
}
break;
case st_replace_boolean:
// shared_st->copy_next_char = 1;
cur_st = process_replace_boolean(shared_st, opts);
if (cur_st == st_no_mark_found) {
shared_st->copy_next_char = 1;
continue;
}
break;
case st_pg_typecast:
shared_st->copy_next_char = 0;
cur_st = process_pg_typecast(shared_st, pg_tc_st);
if (cur_st == st_no_mark_found) {
shared_st->copy_next_char = 1;
continue;
}
break;
case st_array_literal:
shared_st->copy_next_char = 0;
cur_st = process_array_literal(shared_st, array_st);
if (cur_st == st_no_mark_found) {
shared_st->copy_next_char = 1;
continue;
}
break;
case st_literal_prefix_type:
shared_st->copy_next_char = 0;
cur_st = process_literal_prefix_type(shared_st, literal_str_st);
if (cur_st == st_no_mark_found) {
shared_st->copy_next_char = 1;
continue;
}
break;
default:
break;
}
// the state is still active: consume the current char, copying it only if the state requires it
if (shared_st->copy_next_char) {
copy_next_char(shared_st, opts);
} else {
inc_proc_pos(shared_st);
}
}
}
// place the final null terminator
// NOTE(review): 'res_cur_pos' can be one position past 'res_final_pos' at this point; presumably the
// result buffer holds at least 'd_max_len + 1' bytes — confirm at the allocation site.
*shared_st->res_cur_pos = 0;
shared_st->st = cur_st;
// store final state position
stage_1_st->pre_it_pos = shared_st->res_cur_pos;
// if stage isn't finished parsing an element, set the current parsing position at which the last
// element was copied.
if (shared_st->st == st_literal_number) {
stage_1_st->new_end_pos = shared_st->res_cur_pos;
} else {
stage_1_st->new_end_pos = NULL;
}
}
/**
 * @brief Second parsing stage. First compression pass over the output left by 'stage 1'.
 *
 * The stage compacts the result buffer in place, from the position selected by
 * 'set_stage_next_start_pos' up to the stage digest end. It drops:
 *   - Spaces placed right after '(' or right before ')'.
 *   - Spaces next to a ',' and spaces between an arithmetic operator and a '?' mark.
 *   - '+' / '-' signs when they act on a single value.
 *   - When 'opts->replace_number' is '1', digits: every digit is consumed and a '?' is emitted only if
 *     the previously written character isn't already a '?'.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param stage_1_st The state resulting from the previous execution of 'stage 1'.
 * @param stage_2_st The state from the previous execution of 'stage 2', updated on exit.
 * @param opts Options; 'keep_comment' and 'replace_number' are the fields read by this stage.
 */
static __attribute__((always_inline)) inline
void stage_2_parsing(shared_st* shared_st, stage_1_st* stage_1_st, stage_2_st* stage_2_st, const options* opts) {
	char* digest_end = get_stage_digest_end(shared_st, stage_1_st);

	// The stage restarts a few characters behind the point where its previous iteration stopped. The
	// distance of (5 + 1) comes from the pattern `? + ddd` ('d' being a digit), which can be left behind
	// when 'stage 1' was interrupted in the middle of a number:
	//
	// ```
	// previous iteration: `,? + d`
	//                           ^ first digit position of 'stage_1' && last 'stage_2' compression pos
	// next iteration:     `,? + d`
	//                      ^ required start position
	// ```
	//
	// Going back at least `6` characters ensures no pattern is missed in the current iteration.
	const int rewind = 5 + 1;
	char* next_start_pos = stage_2_st->pre_it_pos - (shared_st->gl_c_offset - stage_2_st->c_offset) - rewind;
	set_stage_next_start_pos(shared_st, digest_end, next_start_pos);

	// Work on local cursors; they are stored back into the shared state once the scan is over.
	const char* const buf_start = shared_st->res_init_pos;
	char* rd = shared_st->res_cur_pos; // next character to inspect
	char* wr = shared_st->res_pre_pos; // next position to write a kept character

	while (rd <= digest_end) {
		const char cur = *rd;

		if (cur == ' ') {
			// neighbours of the space; '0' is used as a neutral value when there is no left neighbour
			const char lc = (rd > buf_start) ? *(rd - 1) : '0';
			const char rc = *(rd + 1);
			char drop_space = 0;

			if (lc == '(' || rc == ')') {
				drop_space = 1;
			} else if ((is_arithmetic_op(lc) && rc == '?') || lc == ',' || rc == ',') {
				const char llc = (rd > buf_start + 1) ? *(rd - 2) : '0';
				// a space following a comment end ('*/') is preserved when comments are kept
				const char after_comment = (llc == '*' && lc == '/');
				drop_space = !(opts->keep_comment && after_comment);
			} else if (is_arithmetic_op(rc) && lc == '?') {
				drop_space = 1;
			}

			if (drop_space) {
				rd++;
			} else {
				*wr++ = *rd++;
			}
		} else if (cur == '+' || cur == '-') {
			const char llc = (rd > buf_start + 1) ? *(rd - 2) : '0';
			const char lc = (rd > buf_start) ? *(rd - 1) : '0';
			const char rc = *(rd + 1);

			// patterns to cover:
			//  - ? + ?
			//  - ?,+?
			//  - c +?
			//  - c + ?
			//  - c+ ?
			//  - c+?
			//  - c, + ?
			//
			// The character deciding whether the sign acts on a single value is the closest non-space
			// character on the left: 'llc' when the sign is preceded by a space, 'lc' otherwise.
			const char ref = (lc == ' ') ? llc : lc;
			const char sign_on_value =
				is_token_char(ref) && ref != '?' && ref != ')' && (rc == '?' || rc == ' ');

			// NOTE(review): when the sign is preceded by a space and an identifier character
			// ('is_normal_char(llc)'), it is dropped unconditionally - confirm this is intended.
			if ((lc == ' ' && is_normal_char(llc)) || sign_on_value) {
				rd++;
			} else {
				*wr++ = *rd++;
			}
		} else if (opts->replace_number == 1 && is_digit_char(cur)) {
			if (wr > buf_start && *(wr - 1) != '?') {
				*wr++ = '?';
			}
			rd++;
		} else {
			*wr++ = *rd++;
		}
	}

	shared_st->res_cur_pos = rd;
	shared_st->res_pre_pos = wr;

	// store this iteration position and compute the compression offset
	int c_2_offset = digest_end - wr + 1;
	if (c_2_offset > 0) {
		stage_2_st->c_offset = c_2_offset;
	} else {
		stage_2_st->c_offset = 0;
	}

	end_compression_stage_it(shared_st, digest_end, stage_1_st, &stage_2_st->pre_it_pos);
	shared_st->res_cur_pos = shared_st->res_pre_pos;
}
/**
 * @brief Performs the third stage compression. This stage is a compression stage responsible for collapsing
 *  the value grouping pattern like '(?,?,?)' into '(?,...)' using the limit given by
 *  'opts->grouping_limit'. The stage is a no-op when that limit is '0'.
 *
 * @details Only sequences of '?' marks are collapsed. The element closing a group is counted as a value
 *  exclusively when it is the exact pattern '?)'; any other character placed before ')' breaks the pattern
 *  and is preserved in the digest.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param stage_1_st The state resulting from the previous execution of 'stage 1'.
 * @param stage_3_st The state from previous execution of 'stage 3' to be updated.
 * @param opts Options used for deciding how to perform the group collapsing.
 */
static __attribute__((always_inline)) inline
void stage_3_parsing(shared_st* shared_st, stage_1_st* stage_1_st, stage_3_st* stage_3_st, const options* opts) {
	if (opts->grouping_limit == 0) { return; }

	// compute the 'digest_end' for the stage 3
	char* digest_end = get_stage_digest_end(shared_st, stage_1_st);

	// Compute the starting point for the third stage. The 'min_group_size' value is obtained from
	// the following pattern:
	//
	// previous_iteration - after 'stage_3':
	//
	// ```
	// `(?,?,?,?,?,?,...+ d`
	//  ^                  ^ first digit 'stage_1' position && last 'stage_3' compression pos
	//  | required min 'next_start_pos' for next 'stage_3'
	// ```
	//
	// The break down is:
	//   * opts->grouping_limit*2: Maximum number of characters of groups left.
	//   * 7: sizeof('...+ ') + sizeof('(') + 1.
	//
	int min_group_size = opts->grouping_limit*2 + 7;
	char* next_start_pos =
		stage_3_st->pre_it_pos - (shared_st->gl_c_offset - stage_3_st->c_offset) - (min_group_size + 1);
	set_stage_next_start_pos(shared_st, digest_end, next_start_pos);

	// set once a '(' has been copied, cleared when a ')' is copied
	char group_candidate = 0;

	// it's a fixed pattern, we can perform a lookahead replacement
	while (shared_st->res_cur_pos <= digest_end) {
		// If this isn't the first iteration, it's possible to find an expansion pack '...' that is followed
		// by characters copied in 'stage_1' during this iteration:
		//
		// ```
		// `(?,?,?,?,?,?,...,?,?)`
		//                  ^ last 'stage_3' compression pos, followed by new: `,?,?)`
		// ```
		if (group_candidate == 1 && (shared_st->res_pre_pos - shared_st->res_init_pos) > 4) {
			char found_exp_pack =
				*(shared_st->res_pre_pos-1) == '.' &&
				*(shared_st->res_pre_pos-2) == '.' &&
				*(shared_st->res_pre_pos-3) == '.' &&
				*(shared_st->res_pre_pos-4) == ',';

			if (found_exp_pack == 1 && ((digest_end - shared_st->res_cur_pos) >= 1)) {
				// collapse new patterns found after the expansion
				char* new_cur_pos = shared_st->res_cur_pos;
				bool is_last = 0;

				// if the first character is a ',' we skip it to count the '?,' patterns
				if (*new_cur_pos == ',') {
					new_cur_pos += 1;
				}

				while ((new_cur_pos < digest_end)) {
					if (*new_cur_pos == '?' && *(new_cur_pos+1) == ',') {
						new_cur_pos += 2;
					} else {
						if (*new_cur_pos == '?' && *(new_cur_pos+1) == ')') {
							new_cur_pos += 1;
							is_last = 1;
						}
						break;
					}
				}

				// We update the current position if either:
				//   * At least one '?,' was found.
				//   * The final pattern '?)' was found.
				if ((new_cur_pos > shared_st->res_cur_pos + 1) || is_last) {
					shared_st->res_cur_pos = new_cur_pos;
				}

				// If the first stage hasn't finished parsing a number literal, the following situation is
				// possible, since we previously skipped the found ',':
				//
				// ```
				// `(?,?,?,...,dddd)`
				//             ^ new_cur_pos
				// ```
				//
				// In this case, we break to avoid copying the last char. That copy should be performed by
				// `end_compression_stage_it`.
				if (stage_1_st->literal_dig_st.start_pos) {
					if (new_cur_pos >= digest_end && is_digit_char(*new_cur_pos)) {
						break;
					}
				}
			}
		}

		char* cur_char = shared_st->res_cur_pos;
		char pattern_fits = shared_st->res_cur_pos < digest_end - opts->grouping_limit*2;

		if (group_candidate == 1 && pattern_fits) {
			// NOTE: Minimal viable pattern for replacement is the starting point: '?,?,'.
			// This pattern also matches the size of a 32bit register, so probably will only
			// take one comparison for matching it. This removes a lot of false cases matching the first
			// '?', or '?,' that could be found in column names when digit replacement is performed.
			char is_min_pattern =
				*cur_char == '?' && *(cur_char+1) == ',' &&
				*(cur_char+2) == '?' && (*(cur_char+3) == ',' || *(cur_char+3) == ')');

			// The pattern to match shouldn't be preceded by an arithmetic operator, otherwise, patterns
			// like this '?+?,?,?' could start counting from the first match of '?,', which shouldn't be
			// the case.
			if (is_arithmetic_op(*(cur_char-1)) == 0 && is_min_pattern) {
				int pattern_len = 0;
				// 0: pattern still matching, 1: broken by a foreign element, 2: closed by a final '?)'
				char pattern_broken = 0;
				char* pattern_pos = shared_st->res_cur_pos;

				while ((pattern_pos < digest_end) && pattern_broken == 0) {
					if (*pattern_pos == '?' && *(pattern_pos+1) == ',') {
						pattern_pos += 2;
						pattern_len += 1;
					} else {
						// Only the exact pattern '?)' closes the group as a value. Checking just for the
						// ')' would count (and later skip) any single character placed before the closing
						// bracket, e.g. the 'a' in '(?,?,a)'.
						if (*pattern_pos == '?' && *(pattern_pos+1) == ')') {
							pattern_broken = 2;
						} else {
							pattern_broken = 1;
						}
					}
				}

				// in case of the final pattern being '?)', we need to count the '?' as being replaced for
				// the grouping for matching replacements of the exact length.
				int f_pattern_len = pattern_broken == 2 ? pattern_len * 2 + 1 : pattern_len * 2;

				if (f_pattern_len >= (opts->grouping_limit * 2 + 3)) {
					for (int i = 0; i < pattern_len; i++) {
						if (i < opts->grouping_limit) {
							*shared_st->res_pre_pos++ = '?';
							*shared_st->res_pre_pos++ = ',';
						} else if (i == opts->grouping_limit) {
							*shared_st->res_pre_pos++ = '.';
							*shared_st->res_pre_pos++ = '.';
							*shared_st->res_pre_pos++ = '.';
						}
					}

					// we jump over the final '?' in case the final pattern was '?)'
					if (pattern_broken == 2) {
						shared_st->res_cur_pos = pattern_pos + 1;
					} else {
						shared_st->res_cur_pos = pattern_pos - 1;
					}
				} else {
					for (int i = 0; i < pattern_len; i++) {
						*shared_st->res_pre_pos++ = '?';
						*shared_st->res_pre_pos++ = ',';
					}

					// Update the current position to the position where pattern was broken
					shared_st->res_cur_pos = pattern_pos;
				}
			} else {
				*shared_st->res_pre_pos++ = *shared_st->res_cur_pos++;
			}
		} else {
			*shared_st->res_pre_pos++ = *shared_st->res_cur_pos++;
		}

		// grouping candidates always start with '('
		if (*cur_char == '(') {
			group_candidate = 1;
		} else if (*cur_char == ')') {
			group_candidate = 0;
		}
	}

	// store this iteration position and compute the compression offset
	int c_3_offset = digest_end - (shared_st->res_pre_pos - 1);
	stage_3_st->c_offset = c_3_offset > 0 ? c_3_offset : 0;

	end_compression_stage_it(shared_st, digest_end, stage_1_st, &stage_3_st->pre_it_pos);
	shared_st->res_cur_pos = shared_st->res_pre_pos;
}
/**
 * @brief Checks whether an already collapsed value group, e.g. '(?,?,...)', starts at the supplied position.
 *
 * The expected layout is: '(' + 'opts->grouping_limit' times '?,' + '...' + ')'. Characters are compared
 * from left to right and the check stops at the first mismatch.
 *
 * @param pos The starting pattern position. The initial '('.
 * @param opts Options used to compute the pattern length.
 *
 * @return '1' if a pattern has been found, '0' otherwise.
 */
static inline
bool is_group_pattern(const char* pos, const options* opts) {
	// index of the last character belonging to the '?,' pairs
	const int marks_end = opts->grouping_limit * 2;
	// '(' + '?,' pairs + '...' + ')'
	const int total = 1 + marks_end + 3 + 1;

	for (int idx = 0; idx < total; idx++) {
		char expected;

		if (idx == 0) {
			expected = '(';
		} else if (idx == total - 1) {
			expected = ')';
		} else if (idx > marks_end) {
			expected = '.';
		} else if (idx % 2 == 1) {
			expected = '?';
		} else {
			expected = ',';
		}

		if (pos[idx] != expected) {
			return 0;
		}
	}

	return 1;
}
/**
 * @brief Performs the fourth stage compression. This stage is a compression stage responsible for collapsing
 * the value grouping patterns already compressed by 'stage 3' into a more compact representation, e.g:
 *
 * Pattern:
 *
 * ```
 * (?,?,...),(?,?,...),(?,?,...),(?,?,...),(?,?,...)
 * ```
 *
 * For 'pgsql_thread___query_digests_groups_grouping_limit=3' would be compressed into:
 *
 * ```
 * (?,?,...),(?,?,...),(?,?,...),...
 * ```
 *
 * The stage returns immediately when either 'opts->groups_grouping_limit' or 'opts->grouping_limit' is '0'.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param stage_1_st The state resulting from the previous execution of 'stage 1'.
 * @param stage_4_st The state from previous execution of 'stage 4' to be updated.
 * @param opts Options used for deciding how to perform the group collapsing.
 */
static __attribute__((always_inline)) inline
void stage_4_parsing(shared_st* shared_st, stage_1_st* stage_1_st, stage_4_st* stage_4_st, const options* opts) {
// Groups can only be collapsed when both the value grouping and the groups grouping are enabled.
if (opts->groups_grouping_limit == 0 || opts->grouping_limit == 0) { return; }
char* digest_end = get_stage_digest_end(shared_st, stage_1_st);
// Size of one collapsed group plus its separating comma: '(' + '?,' * grouping_limit + '...' + ')' + ','
int group_pattern_size = (1 + opts->grouping_limit*2 + 3 + 1 + 1);
// Compute the starting point for the fourth stage. Since the previous iteration could have ended in a
// non-collapsed chain of patterns (if the expanded version of last pattern didn't fit in the buffer). The
// last position from the previous iteration could be:
//
// * 'pgsql_thread___query_digests_grouping_limit': 2
// * 'pgsql_thread___query_digests_groups_grouping_limit': 6
//
// ```
// (?,?,...),(?,?,...),(?,?,...),(?,?,...),(?,?,...),(?,?,...),(?,?,+d)
// ^ last position
// ```
//
// Since the '7' pattern was never found, no collapsing took place, so in order to ensure that we lie
// behind the whole pattern for this iteration we use the offset:
//
// ```
// (group_pattern_size * (opts->groups_grouping_limit + 2))
// ```
char* next_start_pos = stage_4_st->pre_it_pos - (group_pattern_size * (opts->groups_grouping_limit + 2));
// compute the starting point for the fourth stage
set_stage_next_start_pos(shared_st, digest_end, next_start_pos);
// it's a fixed pattern, we can perform a lookahead replacement
while (shared_st->res_cur_pos <= digest_end) {
char* cur_char = shared_st->res_cur_pos;
// The lookbehind below reads up to 5 characters before the write position, so it's only performed
// once more than 5 characters have been written.
if ((shared_st->res_pre_pos - shared_st->res_init_pos) > 5) {
// An already emitted groups expansion pack: '),...'
char found_exp_pack =
*(shared_st->res_pre_pos-1) == '.' &&
*(shared_st->res_pre_pos-2) == '.' &&
*(shared_st->res_pre_pos-3) == '.' &&
*(shared_st->res_pre_pos-4) == ',' &&
*(shared_st->res_pre_pos-5) == ')';
if (found_exp_pack == 1) {
char* cur_pattern_pos = cur_char;
// NOTE: In this branch the counter is only incremented, its value is never read.
int found_group_patterns = 0;
// Jump over the found comma or space, same as in 'stage_3'. For a specific case regarding
// the space see 'digest_corner_cases_2.hjson' payload.
if (*cur_pattern_pos == ',' || *cur_pattern_pos == ' ') {
cur_pattern_pos += 1;
}
// Walk over every further group found after the expansion pack. The loop bound keeps the closing
// ')' of the checked group at, or before, 'digest_end'.
while(cur_pattern_pos + (group_pattern_size - 2) <= digest_end) {
if (is_group_pattern(cur_pattern_pos, opts) == 1) {
// When the character following the group is 'digest_end', advance just past the group; otherwise
// also jump over the character that follows it.
if (cur_pattern_pos + (group_pattern_size - 1) == digest_end) {
cur_pattern_pos += group_pattern_size - 1;
} else {
cur_pattern_pos += group_pattern_size;
}
found_group_patterns += 1;
} else {
break;
}
}
// If the lookahead advanced, resume the scan one character before the reached position without
// copying any of the skipped characters.
if (cur_pattern_pos > shared_st->res_cur_pos + 1) {
shared_st->res_cur_pos = cur_pattern_pos - 1;
continue;
}
// Nothing was skipped and the lookahead already sits at, or past, the end: stop the stage.
if (cur_pattern_pos >= digest_end) {
break;
}
}
}
char pattern_fits =
shared_st->res_cur_pos <=
// NOTE: Final '+ 1' due to repeating comma in the pattern not in the final case, and the
// fact that digest_end is the final character, which is part of the pattern
(digest_end - (group_pattern_size * (opts->groups_grouping_limit + 1)) + 2);
// fast check for knowing that this can potentially be a group pattern
if (pattern_fits && *cur_char == '(' && *(cur_char+1) == '?' && *(cur_char+2) == ',') {
char* pattern_start = cur_char;
char* cur_pattern_pos = cur_char;
int found_group_patterns = 0;
// Count the consecutive groups starting at 'pattern_start', jumping over the separating commas.
while(cur_pattern_pos + (group_pattern_size - 2) <= digest_end) {
if (is_group_pattern(cur_pattern_pos, opts) == 1) {
cur_pattern_pos += group_pattern_size - 1;
if (*cur_pattern_pos == ',') {
cur_pattern_pos++;
}
found_group_patterns += 1;
} else {
break;
}
}
// count found forward patterns
if (found_group_patterns > opts->groups_grouping_limit) {
// Keep the first 'groups_grouping_limit' groups (each one with its trailing comma), append the
// expansion pack '...' and continue after the last group found.
memmove(shared_st->res_pre_pos, pattern_start, (long) group_pattern_size * opts->groups_grouping_limit);
shared_st->res_pre_pos += group_pattern_size * opts->groups_grouping_limit;
*shared_st->res_pre_pos++ = '.';
*shared_st->res_pre_pos++ = '.';
*shared_st->res_pre_pos++ = '.';
shared_st->res_cur_pos = cur_pattern_pos;
}
}
// The collapsing above may have moved the read position past the end; otherwise copy one character.
if (shared_st->res_cur_pos > digest_end) {
break;
} else {
*shared_st->res_pre_pos++ = *shared_st->res_cur_pos++;
}
}
// store this iteration position and compute the compression offset
int c_4_offset = digest_end - (shared_st->res_pre_pos - 1);
stage_4_st->c_offset = c_4_offset > 0 ? c_4_offset : 0;
end_compression_stage_it(shared_st, digest_end, stage_1_st, &stage_4_st->pre_it_pos);
shared_st->res_cur_pos = shared_st->res_pre_pos;
}
/**
 * @brief Final stage, responsible for the last cleanups of the digest once the rest of the processing has
 *  been performed. At the moment it performs:
 *
 *   * Trimmed digits replacement.
 *   * Removal of the trailing spaces and semicolons, followed by the NUL termination of the digest.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param stage_1_st Stage 1 final state, used for the trimmed digits replacement.
 * @param opts Options, currently unused.
 */
static __attribute__((always_inline)) inline
void final_stage(shared_st* shared_st, stage_1_st* stage_1_st, const options* opts) {
	// Trimmed digits replacement
	// ==========================
	//
	// Literal number processing requires copying the literal into the output buffer, so processing can
	// finish before a number is completely parsed when compression wasn't able to create enough room.
	// In that case the digest can end like:
	//
	// ```
	// INSERT INTO db.table pi_value VALUES (3.141592
	//                                       ^ end because no room for parsing all the digits
	// ```
	//
	// A final effort is made to homogenize the query, replacing the literal by '?':
	//
	// ```
	// INSERT INTO db.table pi_value VALUES (?
	//                                       ^ replaced literal
	// ```
	const char trimmed_number =
		stage_1_st->literal_dig_st.start_pos != NULL &&
		shared_st->d_max_len <= (shared_st->res_cur_pos - shared_st->res_init_pos) &&
		shared_st->st == st_literal_number &&
		is_digit_char(*stage_1_st->literal_dig_st.start_pos);

	if (trimmed_number) {
		char* mark = stage_1_st->literal_dig_st.start_pos;
		mark[0] = '?';
		mark[1] = '\0';
		// the stored literal start is left pointing right after the replacement mark
		stage_1_st->literal_dig_st.start_pos = mark + 1;
	}

	// Remove all trailing whitespaces and semicolons
	// ==============================================
	//
	//  - Final spaces left by comments which are never collapsed, ex:
	//
	//  ```
	//  Q: `select 1.1 -- final_comment  \n`
	//  D: `select ? `
	//              ^ never collapsed
	//  ```
	//
	//  - Semicolons (';') marking the end of the query are also removed.
	//
	// 'tail' is kept one past the last preserved character. The first character of the buffer is never
	// inspected, so the scan can't move before the buffer start (v1_crashing_payload_06).
	char* tail = shared_st->res_cur_pos;
	while ((tail - 1) > shared_st->res_init_pos && (*(tail - 1) == ' ' || *(tail - 1) == ';')) {
		tail--;
	}
	*tail = '\0';

	// NOTE: Since this is the last operation this isn't really required. But it's left in case this block
	// is moved in the future.
	shared_st->res_cur_pos = tail;
}
/**
* @brief Parse the supplied query and returns a query digest. Newer implementation based on different parsing
* stages in order to simplify branching and processing logic:
*
* - First stage: Replacing of literal values and double spaces. The goal of this stage is homogenize the
* query values as much as possible to reduce branching in further processing stages.
* - Second stage: Replacing of extra spaces and arithmetic operators (+|-) when they are in front of a
* single value.
* - Third stage: Perform different supported grouping operations for the already replaced values.
*
* @param s The query to be parsed.
* @param len The length of the received query.
* @param fst_cmnt Pointer to store the fst cmnt found in the query, if any.
* @param buf Buffer to use to store the digest for the supplied query, if no buffer is supplied, memory will
* be allocated based on 'pgsql_thread___query_digests_max_query_length' and supplied query length.
*
* @return A pointer to the start of the supplied buffer, or the allocated memory containing the digest.
*/
char* pgsql_query_digest_and_first_comment(const char* const q, int q_len, char** const fst_cmnt, char* const buf, const options* opts) {
#ifdef DEBUG
if (buf != NULL) {
memset(buf, 0, 127);
}
#endif
/* buffer to store first comment. */
int d_max_len = get_digest_max_len(q_len, opts->max_query_length);
char* res = get_result_buffer(d_max_len, buf);
#ifdef DEBUG
res[d_max_len] = 0;
#endif
// state shared between all the parsing states
struct shared_st shared_st;
memset(&shared_st, 0, sizeof(struct shared_st));
init_shared_st(&shared_st, q, q_len, d_max_len, res);
// individual states for stages
struct stage_1_st stage_1_st;
memset(&stage_1_st, 0, sizeof(struct stage_1_st));
init_stage_1_st(&stage_1_st);
struct stage_2_st stage_2_st;
struct stage_3_st stage_3_st;
struct stage_4_st stage_4_st;
memset(&stage_2_st, 0, sizeof(struct stage_2_st));
memset(&stage_3_st, 0, sizeof(struct stage_3_st));
memset(&stage_4_st, 0, sizeof(struct stage_4_st));
char min_digest_size = 0;
// TODO: This may requires a stopping point, configurable or not, otherwise parsing can become slow for
// very big queries that will require multiple compression stages for processing them. Instead if a
// maximum number of iterations is imposed, those queries will stop being parsed before the maximum
// compression, but the overhead can be greatly reduced. Example of these queries can be:
//
// ```
// ˇ Query continues...
// INSERT INTO db.table (colj-1,colk,...) VALUES (?,...),(?,...) ON DUPLICATE KEY UPDATE col1 = VALUES(col2) + VALUES(col3)')
// 'n' number of values ^ ^ 'm' number of repetitions
// ```
//
// If 'n' and 'm' are big numbers, the number of iterations for performing the collapsing would totally be
// dependent of: length(query) / max_query_digest_length. Most of this kind of query, will keep being
// collapsed, since none of the iterations will fill the buffer, since all the new values will be
// collapsed. Due to this, we might want to offer a way or limit to stop the iteration and offer a
// trade off between compression and performance for very big queries.
while (min_digest_size == 0) {
stage_1_parsing(&shared_st, &stage_1_st, opts, fst_cmnt);
stage_2_parsing(&shared_st, &stage_1_st, &stage_2_st, opts);
stage_3_parsing(&shared_st, &stage_1_st, &stage_3_st, opts);
stage_4_parsing(&shared_st, &stage_1_st, &stage_4_st, opts);
// compute the compression offset of the whole iteration
shared_st.gl_c_offset = stage_1_st.pre_it_pos - shared_st.res_cur_pos;
if (
shared_st.q_cur_pos >= shared_st.q_len ||
d_max_len <= (shared_st.res_cur_pos - shared_st.res_init_pos) ||
shared_st.gl_c_offset == 0
) {
min_digest_size = 1;
} else {
// we need to update the shared state for processing again from the previous ending point
char* new_start_point = shared_st.res_cur_pos;
shared_st.res_it_init_pos = new_start_point;
shared_st.res_cur_pos = new_start_point;
shared_st.res_pre_pos = new_start_point;
}
}
final_stage(&shared_st, &stage_1_st, opts);
return res;
}
/**
 * @brief Processes one character of a string literal, replacing the whole literal by '?' once its closing
 *  delimiter is found.
 *
 * When the literal is replaced, a preceding sign ('+' or '-') placed right after ',' or '(' (optionally
 * separated by one space), or a preceding space placed after ',', '(' or an arithmetic operator, is removed
 * as well.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param str_st State of the literal being parsed: delimiter character and delimiters found.
 *
 * @return 'st_literal_string' while the literal is still being parsed, or when the closing delimiter is
 *  the last character to process; 'st_no_mark_found' once the literal has been replaced.
 */
static __attribute__((always_inline)) inline
enum p_st process_literal_string_space_rm(shared_st* shared_st, literal_string_st* str_st) {
	// first call for this literal: register and copy the opening delimiter, then consume it
	if (str_st->delim_num == 0) {
		const char opening = *shared_st->q;

		str_st->delim_char = opening;
		str_st->delim_num = 1;
		// TODO: Remove exp space replacement
		*shared_st->res_cur_pos++ = opening;
		shared_st->q++;
		shared_st->q_cur_pos++;
	}

	const char delim = str_st->delim_char;

	// escaped sequences need to be ignored
	if (shared_st->res_cur_pos > shared_st->res_pre_pos + SIZECHAR) {
		const char prev = shared_st->prev_char;
		const char cur = *shared_st->q;
		const char escaped =
			(prev == '\\' && (cur == '\\' || cur == delim)) || // to process '\\\\', '\\' and '\''
			(prev == delim && cur == delim);                   // to process ''''

		if (escaped) {
			shared_st->prev_char = 'X';
			shared_st->q++;
			shared_st->q_cur_pos++;
			return st_literal_string;
		}
	}

	// not a delimiter: the literal continues
	if (*shared_st->q != delim) {
		return st_literal_string;
	}
	// a delimiter followed by another one isn't a closing delimiter, unless it's the last char to process
	if (shared_st->d_max_len != shared_st->q_cur_pos + 1 && *(shared_st->q + SIZECHAR) == delim) {
		return st_literal_string;
	}

	// closing delimiter found - swap the string by '?'
	shared_st->res_cur_pos = shared_st->res_pre_pos;

	// the three characters written right before the literal, when available
	char* const before = shared_st->res_pre_pos - 3;
	if (before >= shared_st->res_init_pos) {
		const char c0 = before[0];
		const char c1 = before[1];
		const char c2 = before[2];

		if (c2 == '-' || c2 == '+') {
			// remove '+|-' symbols before the found literal
			if (c1 == ',' || c1 == '(' || (c1 == ' ' && (c0 == ',' || c0 == '('))) {
				shared_st->res_cur_pos--;
			}
		} else if (is_space_char(c2)) {
			// remove spaces before the found literal
			if (c1 == ',' || c1 == '(' || is_arithmetic_op(c1)) {
				const char first_copied = before[3];
				if (first_copied == '\'' || first_copied == '"') {
					shared_st->res_cur_pos--;
				}
			}
		}
	}

	// place the replacement mark
	*shared_st->res_cur_pos++ = '?';
	shared_st->prev_char = '?';

	// don't copy this char if last; the same state is kept, no token was found
	if (shared_st->d_max_len == shared_st->q_cur_pos + 1) {
		shared_st->copy_next_char = 0;
		return st_literal_string;
	}

	// reinit the string literal state
	str_st->delim_char = 0;
	str_st->delim_num = 0;

	// update the shared state; the delimiter was just reset, so 'prev_char' ends up being '0'
	shared_st->prev_char = str_st->delim_char;
	if (shared_st->q_cur_pos < shared_st->d_max_len) {
		shared_st->q++;
	}
	shared_st->q_cur_pos++;

	// exit the literal parsing state
	return st_no_mark_found;
}
/**
 * @brief Processes one character of a number literal candidate. Once the candidate is finished, by a token
 *  char or by reaching the last character to process, the characters copied in
 *  '[res_pre_pos, res_cur_pos)' are either:
 *
 *   - Replaced by a single '?', when they form a number ('is_digit_string'). Signs and spaces preceding
 *     the number are removed in the same cases handled by the checks below.
 *   - Left as they are, with every run of digits collapsed into a single '?' when 'opts->replace_number'
 *     is enabled.
 *
 * Fixes over the previous implementation of the digits collapsing:
 *   - The replacement pass was indexed from 'res_cur_pos' instead of 'res_pre_pos', so it modified memory
 *     placed after the token and never replaced the digits of the token itself.
 *   - The collapsing pass read up to two characters past the token and removed only part of each digit run.
 *   - The lookbehind over the query ('q - 1') was performed even at the first position of the query.
 *
 * @param shared_st Shared state used to continue the query processing.
 * @param digit_st State of the number literal being parsed.
 * @param opts Options, 'replace_number' enables the digits collapsing for non-number tokens.
 *
 * @return 'st_literal_number' while the literal is still being parsed, or when the number was replaced at
 *  the last character to process; 'st_no_mark_found' once the token is finished.
 */
static __attribute__((always_inline)) inline
enum p_st process_literal_digit_space_rm(shared_st* shared_st, literal_digit_st* digit_st, options* opts) {
	enum p_st next_state = st_literal_number;

	// consume the first digit; at the very beginning of the query there is no previous char to inspect
	if (
		digit_st->first_digit == 1 &&
		(shared_st->q_cur_pos == 0 || is_token_char(*(shared_st->q-1))) &&
		is_digit_char(*shared_st->q)
	) {
		// place the previous position at the number start
		*shared_st->res_cur_pos++ = *shared_st->q;
		digit_st->first_digit = 0;
		shared_st->q++;
		shared_st->q_cur_pos++;
	}

	// is float
	if (
		*shared_st->q == '.' || (*shared_st->q == 'e' || *shared_st->q == 'E') ||
		(
			(*shared_st->q == '+' || *shared_st->q == '-') &&
			(shared_st->prev_char == 'e' || shared_st->prev_char == 'E')
		)
	) {
		shared_st->prev_char = *shared_st->q;
		shared_st->copy_next_char = 0;
		return next_state;
	}

	// token char or last char
	if (is_token_char(*shared_st->q) || shared_st->d_max_len == shared_st->q_cur_pos + 1) {
		if (is_digit_string(shared_st->res_pre_pos, shared_st->res_cur_pos)) {
			shared_st->res_cur_pos = shared_st->res_pre_pos;

			// The checks below inspect the three characters written before the number; they only read
			// the buffer, so each of them can independently move the write position back.
			char* _p = shared_st->res_pre_pos - 3;
			const char has_context = _p >= shared_st->res_init_pos;

			// remove symbol and keep parenthesis or comma
			if (has_context && ( *(_p+2) == '-' || *(_p+2) == '+') ) {
				if (
					( *(_p+1) == ',' ) || (*(_p+1) == '(') ||
					( (*(_p+1) == ' ') && (*_p == ',' || *_p == '(') )
				) {
					shared_st->res_cur_pos--;
				}
			}

			// Remove spaces before number counting with possible '.' presence
			if (has_context && *_p == '.' &&
				(*(_p+1) == ' ' || *(_p+1) == '.') &&
				(*(_p+2) == '-' || *(_p+2) == '+')
			) {
				if (*(_p + 1) == ' ') {
					shared_st->res_cur_pos--;
				}
				shared_st->res_cur_pos--;
			}

			// remove spaces after an opening bracket when followed by a number
			if (has_context && *(_p+1) == '(' && *(_p+2) == ' ') {
				shared_st->res_cur_pos--;
			}

			// remove spaces before number
			if (has_context && is_space_char(*(_p + 2))) {
				// a point '.' can be found prior to a number in case of query grouping
				if (
					*(_p+1) == '-' || *(_p+1) == '+' || *(_p+1) == '*' || *(_p+1) == '/' ||
					*(_p+1) == '%' || *(_p+1) == ',' || *(_p+1) == '.'
				) {
					shared_st->res_cur_pos--;
				}
			}

			// place the replacement mark
			*shared_st->res_cur_pos++ = '?';
			shared_st->prev_char = '?';

			// don't copy this char if last
			if (shared_st->d_max_len == shared_st->q_cur_pos + 1) {
				shared_st->copy_next_char = 0;
				// keep the same state, no token was found
				return next_state;
			}
		} else if (opts->replace_number) {
			// Collapse every run of digits found in the token into a single '?'. The compaction is done
			// in place, with a single pass that never leaves '[res_pre_pos, res_cur_pos)'.
			char* wr = shared_st->res_pre_pos;
			char in_digit_run = 0;

			for (char* rd = shared_st->res_pre_pos; rd < shared_st->res_cur_pos; rd++) {
				if (is_digit_char(*rd)) {
					if (in_digit_run == 0) {
						*wr++ = '?';
						in_digit_run = 1;
					}
				} else {
					*wr++ = *rd;
					in_digit_run = 0;
				}
			}

			shared_st->res_cur_pos = wr;
		}

		next_state = st_no_mark_found;
	}

	return next_state;
}
/**
 * @brief Returns a newly allocated copy of the query with its comments removed.
 *
 * While copying, the function also:
 *   - Skips the leading space characters.
 *   - Converts every space character (' ', '\t', '\n', '\r') into ' ' and collapses consecutive ones.
 *   - Lowercases the copied characters when 'lowercase' is 'true'.
 *   - Removes a single trailing space.
 *
 * Comments are recognized as '/' followed by '*' up to the matching '*' '/', and as '--' up to the end of
 * the line or of the query. String literals receive no special treatment, comment markers found inside
 * them are processed as comments too.
 *
 * NOTE(review): For a '--' comment finished by a new line, only the second '-' and the comment body are
 * removed; the first '-' already copied stays in the result. It's only removed when the comment reaches
 * the end of the query. Confirm if this is the intended behavior.
 *
 * @param s The query to process, at least '_len' bytes must be readable.
 * @param _len The length of the query. Negative values are handled as '0'.
 * @param lowercase Whether the resulting query should be lowercased.
 *
 * @return A NUL terminated string allocated with 'malloc' which must be freed by the caller, or 'NULL' if
 *  the allocation failed.
 */
char* pgsql_query_strip_comments(char *s, int _len, bool lowercase) {
	int i = 0;
	// a negative length would otherwise lead to an undersized allocation for the final terminator
	int len = _len > 0 ? _len : 0;
	char *r = (char *) malloc(len + SIZECHAR);
	if (r == NULL) {
		return NULL;
	}

	char *p_r = r;      // current write position
	char *p_r_t = r;    // write position to restore once a comment is finished
	char prev_char = 0;
	char flag = 0;      // 0: regular text, 1: inside '/* */' comment, 2: inside '--' comment
	char fns = 0;       // set once the first non space character has been found

	while (i < len)
	{
		// =================================================
		// START - read token char and set flag what's going on.
		// =================================================
		if (flag == 0)
		{
			// store current position
			p_r_t = p_r;

			// comment type 1 - start with '/*'
			if (prev_char == '/' && *s == '*')
			{
				flag = 1;
			}
			// comment type 2 - start with '--'
			else if (prev_char == '-' && *s == '-')
			{
				flag = 2;
			}
			// not above case - remove duplicated space char
			else
			{
				flag = 0;
				// leading spaces are never copied
				if (fns == 0 && is_space_char(*s)) {
					s++;
					i++;
					continue;
				}
				if (fns == 0) fns = 1;
				if (is_space_char(prev_char) && is_space_char(*s)) {
					prev_char = ' ';
					*p_r = ' ';
					s++;
					i++;
					continue;
				}
			}
		}
		// =================================================
		// PROCESS and FINISH - do something on each case
		// =================================================
		else
		{
			// --------
			// comment
			// --------
			if (
				// comment type 1 - /* .. */
				(flag == 1 && prev_char == '*' && *s == '/')
				||
				// comment type 2 - --... \n
				(flag == 2 && (*s == '\n' || *s == '\r' || (i == len - 1)))
			)
			{
				// discard everything copied since the comment was detected
				p_r = p_r_t;
				// also discard the first character of the comment opening, copied before the detection
				if (flag == 1 || (i == len - 1)) {
					p_r -= SIZECHAR;
				}
				prev_char = ' ';
				flag = 0;
				s++;
				i++;
				continue;
			}
		}

		// =================================================
		// COPY CHAR
		// =================================================
		// convert every space char to ' '
		if (is_space_char(*s)) {
			*p_r++ = ' ';
		} else if (lowercase == false) {
			*p_r++ = *s;
		} else {
			// 'tolower' requires a value representable as 'unsigned char'; a plain 'char' holding a
			// non-ASCII byte can be negative
			*p_r++ = (char) tolower((unsigned char) *s);
		}
		prev_char = *s++;
		i++;
	}

	// remove a trailing space
	if (p_r > r) {
		char *e = p_r;
		e--;
		if (*e == ' ') {
			*e = 0;
		}
	}

	*p_r = 0;

	return r;
}
char* pgsql_query_digest_first_stage(const char* const q, int q_len, char** const fst_cmnt, char* const buf) {
/* buffer to store first comment. */
int d_max_len = get_digest_max_len(q_len, pgsql_thread___query_digests_max_query_length);
char* res = get_result_buffer(d_max_len, buf);
// global options
options opts;
get_pgsql_options(&opts);
// state shared between all the parsing states
struct shared_st shared_st;
memset(&shared_st, 0, sizeof(struct shared_st));
init_shared_st(&shared_st, q, q_len, d_max_len, res);
struct stage_1_st stage_1_st;
memset(&stage_1_st, 0, sizeof(struct stage_1_st));
init_stage_1_st(&stage_1_st);
// perform just the first stage parsing
stage_1_parsing(&shared_st, &stage_1_st, &opts, fst_cmnt);
final_stage(&shared_st, &stage_1_st, &opts);
return res;
}
char* pgsql_query_digest_second_stage(const char* const q, int q_len, char** const fst_cmnt, char* const buf) {
/* buffer to store first comment. */
int d_max_len = get_digest_max_len(q_len, pgsql_thread___query_digests_max_query_length);
char* res = get_result_buffer(d_max_len, buf);
// global options
options opts;
get_pgsql_options(&opts);
// state shared between all the parsing states
struct shared_st shared_st;
memset(&shared_st, 0, sizeof(struct shared_st));
init_shared_st(&shared_st, q, q_len, d_max_len, res);
struct stage_1_st stage_1_st;
memset(&stage_1_st, 0, sizeof(struct stage_1_st));
init_stage_1_st(&stage_1_st);
struct stage_2_st stage_2_st;
memset(&stage_2_st, 0, sizeof(struct stage_2_st));
// perform just the first stage parsing
stage_1_parsing(&shared_st, &stage_1_st, &opts, fst_cmnt);
// second stage parsing
stage_2_parsing(&shared_st, &stage_1_st, &stage_2_st, &opts);
final_stage(&shared_st, &stage_1_st, &opts);
return res;
}
char* pgsql_query_digest_and_first_comment_2(const char* const q, int q_len, char** const fst_cmnt, char* const buf) {
// global options
options opts;
get_pgsql_options(&opts);
return pgsql_query_digest_and_first_comment(q, q_len, fst_cmnt, buf, &opts);
}
// Single-pass digest computation for a PostgreSQL query.
//
// Walks the query once, from the first byte up to 'd_max_len' (the value returned by
// 'get_digest_max_len' for 'q_len' and 'pgsql_thread___query_digests_max_query_length'), and
// writes the digest into the buffer returned by 'get_result_buffer'. A small state machine
// ('cur_st') decides, char by char, whether the input is copied as-is (with generic space
// compression) or handed to one of the specialised 'process_*' handlers (comments, string
// literals, numeric literals, dollar-quoted strings).
//
// Params:
//  - q: query text to be digested.
//  - q_len: length of 'q'.
//  - fst_cmnt: forwarded untouched to 'process_cmnt_type_1'; presumably receives the first
//    comment found in the query -- confirm against that function.
//  - buf: optional caller-supplied buffer, forwarded to 'get_result_buffer'. In DEBUG builds
//    its first 127 bytes are zeroed, so when non-NULL it must be at least that big.
//
// Returns:
//  The NUL-terminated digest, i.e. the pointer returned by 'get_result_buffer'. Ownership of
//  that buffer when 'buf' is NULL is decided by 'get_result_buffer' (not visible here).
char* pgsql_query_digest_and_first_comment_one_it(char* q, int q_len, char** fst_cmnt, char* buf) {
#ifdef DEBUG
	if (buf != NULL) {
		memset(buf, 0, 127);
	}
#endif
	int d_max_len = get_digest_max_len(q_len, pgsql_thread___query_digests_max_query_length);
	char* res = get_result_buffer(d_max_len, buf);

	// global options
	options opts;
	get_pgsql_options(&opts);

	// state shared between all the parsing states
	struct shared_st shared_st;
	memset(&shared_st, 0, sizeof(struct shared_st));
	shared_st.q = q;
	shared_st.q_len = q_len;
	shared_st.d_max_len = d_max_len;
	shared_st.res_init_pos = res;
	shared_st.res_it_init_pos = res;
	shared_st.res_cur_pos = res;
	shared_st.res_pre_pos = res;

	// state required between different iterations of special parsing states
	struct cmnt_type_1_st c_t_1_st;
	struct literal_string_st literal_str_st;
	struct literal_digit_st literal_digit_st;
	struct dollar_quote_string_st dollar_str_st;
	// NOTE(review): the next three states are zero-initialised but never handed to any handler
	// in this function. Confirm whether 'get_next_st' can return the matching parsing states
	// here; if it can, the dispatch below has no branch for them.
	struct pg_typecast_st typecast_st;
	struct array_literal_st array_st;
	struct quoted_identifier_st quoted_iden_st;
	memset(&c_t_1_st, 0, sizeof(struct cmnt_type_1_st));
	memset(&literal_str_st, 0, sizeof(struct literal_string_st));
	memset(&literal_digit_st, 0, sizeof(struct literal_digit_st));
	memset(&dollar_str_st, 0, sizeof(struct dollar_quote_string_st));
	memset(&typecast_st, 0, sizeof(struct pg_typecast_st));
	memset(&array_st, 0, sizeof(struct array_literal_st));
	memset(&quoted_iden_st, 0, sizeof(struct quoted_identifier_st));

	enum p_st cur_st = st_no_mark_found;

	// start char consumption
	while (shared_st.q_cur_pos < d_max_len) {
		if (cur_st == st_no_mark_found) {
			// update the last position over the return buffer to be the current position
			shared_st.res_pre_pos = shared_st.res_cur_pos;
			cur_st = get_next_st(&opts, &shared_st);

			// if next st isn't 'no_mark_found' transition to it without consuming current char
			if (cur_st != st_no_mark_found) {
				continue;
			}
			else {
				// generic space removal operations
				// ================================
				// Removal of spaces that don't belong to any particular parsing state.

				// ignore all the leading spaces
				if (shared_st.res_cur_pos == shared_st.res_init_pos && is_space_char(*shared_st.q)) {
					shared_st.q++;
					shared_st.q_cur_pos++;
					continue;
				}

				// suppress all the double spaces.
				// ==============================
				//
				// The suppression is performed using the address of the second space found as the
				// pivoting point for further space suppression in the result buffer:
				//
				// ```
				// Q: `SELECT\s\s 1`
				//             ^ address used to be replaced by next char
				// ```
				if (is_space_char(shared_st.prev_char) && is_space_char(*shared_st.q)) {
					// if current position in result buffer is the first space found, we move to the next
					// position, in order to respect the first space char.
					// NOTE: 'res_cur_pos - 1' is in bounds here; the leading-space check above already
					// handled the case in which nothing has been written yet and '*q' is a space.
					if (!is_space_char(*(shared_st.res_cur_pos - 1))) {
						shared_st.res_cur_pos++;
					}
					shared_st.prev_char = ' ';
					*shared_st.res_cur_pos = ' ';
					shared_st.q++;
					shared_st.q_cur_pos++;
					continue;
				}

				{
					// Look-back pointer to the char written before the pending space. It's only formed
					// when at least two chars have been written, so we never build a pointer that lies
					// before the start of the result buffer; NULL means "no look-back available".
					const int has_lookback = (shared_st.res_cur_pos - shared_st.res_init_pos) >= 2;
					char* p = has_lookback ? shared_st.res_cur_pos - 2 : NULL;

					// suppress spaces before arithmetic operators
					if (p != NULL && is_space_char(shared_st.prev_char) && is_arithmetic_op(*shared_st.q)) {
						if (*p == '?') {
							shared_st.prev_char = *shared_st.q;
							// overwrite the pending space with the operator
							--shared_st.res_cur_pos;
							*shared_st.res_cur_pos++ = *shared_st.q;
							shared_st.q++;
							shared_st.q_cur_pos++;
							continue;
						}
					}

					// suppress spaces before and after commas
					if (
						p != NULL && is_space_char(shared_st.prev_char) &&
						((*shared_st.q == ',') || (*p == ','))
					) {
						if (*shared_st.q == ',') {
							// space before comma: overwrite the space with the comma
							--shared_st.res_cur_pos;
							*shared_st.res_cur_pos++ = *shared_st.q;
							shared_st.prev_char = ',';
							shared_st.q++;
							shared_st.q_cur_pos++;
						}
						else {
							// space after comma: drop the space; current char isn't consumed and is
							// re-evaluated in the next iteration
							shared_st.prev_char = ',';
							--shared_st.res_cur_pos;
						}
						continue;
					}

					// suppress spaces before closing brackets when grouping or mark is present
					if (
						p != NULL && (*p == '.' || *p == '?') &&
						is_space_char(shared_st.prev_char) && (*shared_st.q == ')')
					) {
						shared_st.prev_char = *shared_st.q;
						--shared_st.res_cur_pos;
						*shared_st.res_cur_pos++ = *shared_st.q;
						shared_st.q++;
						shared_st.q_cur_pos++;
						continue;
					}
				}

				// copy the current char
				copy_next_char(&shared_st, &opts);
			}
		} else {
			if (cur_st == st_cmnt_type_1) {
				// by default, we don't copy the next char for comments
				shared_st.copy_next_char = 0;
				cur_st = process_cmnt_type_1(&opts, &shared_st, &c_t_1_st, fst_cmnt);
				if (cur_st == st_no_mark_found) {
					shared_st.copy_next_char = 1;
					continue;
				}
			} else if (cur_st == st_cmnt_type_2) {
				shared_st.copy_next_char = 0;
				cur_st = process_cmnt_type_2(&shared_st);
				if (cur_st == st_no_mark_found) {
					shared_st.copy_next_char = 1;
					continue;
				}
			} else if (cur_st == st_literal_string) {
				shared_st.copy_next_char = 1;
				cur_st = process_literal_string_space_rm(&shared_st, &literal_str_st);
				if (cur_st == st_no_mark_found) {
					shared_st.copy_next_char = 1;
					continue;
				}
			} else if (cur_st == st_literal_number) {
				shared_st.copy_next_char = 1;
				cur_st = process_literal_digit_space_rm(&shared_st, &literal_digit_st, &opts);
				if (cur_st == st_no_mark_found) {
					// re-arm the digit state for the next numeric literal
					literal_digit_st.first_digit = 1;
					shared_st.copy_next_char = 1;
					continue;
				}
			} else if (cur_st == st_dollar_quote_string) {
				shared_st.copy_next_char = 1;
				cur_st = process_dollar_quote_string(&shared_st, &dollar_str_st);
				if (cur_st == st_no_mark_found) {
					shared_st.copy_next_char = 1;
					continue;
				}
			}

			if (shared_st.copy_next_char) {
				copy_next_char(&shared_st, &opts);
			}
			else {
				// if we do not copy we skip the next char, but copy it to `prev_char`
				shared_st.prev_char = *shared_st.q++;
				shared_st.q_cur_pos++;
			}
		}
	}

	// remove all trailing whitespaces
	// ===============================
	//
	// Final spaces left by comments which are never collapsed, ex:
	//
	// ```
	// Q: `select 1.1 -- final_comment  \n`
	// D: `select ?  `
	//              ^ never collapsed
	// ```
	if (shared_st.res_cur_pos > shared_st.res_it_init_pos) {
		// 'wspace' points one past the last char kept. The scan is bounded by 'res_it_init_pos'
		// so a result made only of spaces can't make us read (or write) before the buffer start.
		char* wspace = shared_st.res_cur_pos;
		while (wspace > shared_st.res_it_init_pos && *(wspace - 1) == ' ') {
			wspace--;
		}
		*wspace = '\0';
	}

	// place the final null terminator
	*shared_st.res_cur_pos = 0;

	return res;
}