diff --git a/src/parser.c b/src/parser.c index 74c9c3e..7844602 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1,7 +1,6 @@ #include "tree_sitter/parser.h" #if defined(__GNUC__) || defined(__clang__) -#pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif @@ -40287,10 +40286,12 @@ unsigned tree_sitter_yaml_external_scanner_serialize(void *, char *); void tree_sitter_yaml_external_scanner_deserialize(void *, const char *, unsigned); #ifdef _WIN32 -#define extern __declspec(dllexport) +#define TS_PUBLIC __declspec(dllexport) +#else +#define TS_PUBLIC __attribute__((visibility("default"))) #endif -extern const TSLanguage *tree_sitter_yaml(void) { +TS_PUBLIC const TSLanguage *tree_sitter_yaml() { static const TSLanguage language = { .version = LANGUAGE_VERSION, .symbol_count = SYMBOL_COUNT, diff --git a/src/scanner.c b/src/scanner.c new file mode 100644 index 0000000..35e32a9 --- /dev/null +++ b/src/scanner.c @@ -0,0 +1,1371 @@ +#include "tree_sitter/array.h" +#include "tree_sitter/parser.h" + +#include "./schema.generated.c" +#include + +// clang-format off + +typedef enum { + END_OF_FILE, + + S_DIR_YML_BGN, R_DIR_YML_VER, + S_DIR_TAG_BGN, R_DIR_TAG_HDL, R_DIR_TAG_PFX, + S_DIR_RSV_BGN, R_DIR_RSV_PRM, + S_DRS_END, + S_DOC_END, + R_BLK_SEQ_BGN, BR_BLK_SEQ_BGN, B_BLK_SEQ_BGN, + R_BLK_KEY_BGN, BR_BLK_KEY_BGN, B_BLK_KEY_BGN, + R_BLK_VAL_BGN, BR_BLK_VAL_BGN, B_BLK_VAL_BGN, + R_BLK_IMP_BGN, + R_BLK_LIT_BGN, BR_BLK_LIT_BGN, + R_BLK_FLD_BGN, BR_BLK_FLD_BGN, + BR_BLK_STR_CTN, + R_FLW_SEQ_BGN, BR_FLW_SEQ_BGN, B_FLW_SEQ_BGN, + R_FLW_SEQ_END, BR_FLW_SEQ_END, + R_FLW_MAP_BGN, BR_FLW_MAP_BGN, B_FLW_MAP_BGN, + R_FLW_MAP_END, BR_FLW_MAP_END, + R_FLW_SEP_BGN, BR_FLW_SEP_BGN, + R_FLW_KEY_BGN, BR_FLW_KEY_BGN, + R_FLW_JSV_BGN, BR_FLW_JSV_BGN, + R_FLW_NJV_BGN, BR_FLW_NJV_BGN, + R_DQT_STR_BGN, BR_DQT_STR_BGN, B_DQT_STR_BGN, + R_DQT_STR_CTN, BR_DQT_STR_CTN, + R_DQT_ESC_NWL, BR_DQT_ESC_NWL, + R_DQT_ESC_SEQ, BR_DQT_ESC_SEQ, + R_DQT_STR_END, BR_DQT_STR_END, + R_SQT_STR_BGN, BR_SQT_STR_BGN, B_SQT_STR_BGN, + R_SQT_STR_CTN, BR_SQT_STR_CTN, + R_SQT_ESC_SQT, BR_SQT_ESC_SQT, + R_SQT_STR_END, BR_SQT_STR_END, + + R_SGL_PLN_NUL_BLK, BR_SGL_PLN_NUL_BLK, B_SGL_PLN_NUL_BLK, R_SGL_PLN_NUL_FLW, BR_SGL_PLN_NUL_FLW, + R_SGL_PLN_BOL_BLK, BR_SGL_PLN_BOL_BLK, B_SGL_PLN_BOL_BLK, R_SGL_PLN_BOL_FLW, BR_SGL_PLN_BOL_FLW, + R_SGL_PLN_INT_BLK, BR_SGL_PLN_INT_BLK, B_SGL_PLN_INT_BLK, R_SGL_PLN_INT_FLW, BR_SGL_PLN_INT_FLW, + R_SGL_PLN_FLT_BLK, BR_SGL_PLN_FLT_BLK, B_SGL_PLN_FLT_BLK, R_SGL_PLN_FLT_FLW, BR_SGL_PLN_FLT_FLW, + R_SGL_PLN_STR_BLK, BR_SGL_PLN_STR_BLK, B_SGL_PLN_STR_BLK, R_SGL_PLN_STR_FLW, BR_SGL_PLN_STR_FLW, + + R_MTL_PLN_STR_BLK, BR_MTL_PLN_STR_BLK, + R_MTL_PLN_STR_FLW, BR_MTL_PLN_STR_FLW, + + R_TAG, BR_TAG, B_TAG, + R_ACR_BGN, BR_ACR_BGN, B_ACR_BGN, R_ACR_CTN, + R_ALS_BGN, BR_ALS_BGN, B_ALS_BGN, R_ALS_CTN, + + BL, + COMMENT, +} TokenType; + +// clang-format on + +#define SCN_SUCC 1 +#define SCN_STOP 0 +#define SCN_FAIL (-1) + +#define IND_ROT 'r' +#define IND_MAP 'm' +#define IND_SEQ 'q' +#define IND_STR 's' + +#define RET_SYM(RESULT_SYMBOL) \ + { \ + flush(scanner); \ + lexer->result_symbol = RESULT_SYMBOL; \ + return true; \ + } + +#define POP_IND() \ + { \ + /* incorrect status caused by error recovering */ \ + if (scanner->ind_typ_stk.size == 1) { \ + return false; \ + } \ + pop_ind(scanner); \ + } + +#define PUSH_IND(TYP, LEN) push_ind(scanner, TYP, LEN) + +#define PUSH_BGN_IND(TYP) \ + { \ + if (has_tab_ind) \ + return false; \ + push_ind(scanner, TYP, bgn_col); \ + } + +#define MAY_PUSH_IMP_IND(TYP) \ + { \ + if (cur_ind != scanner->blk_imp_col) { \ + if (scanner->blk_imp_tab) \ + return false; \ + push_ind(scanner, IND_MAP, scanner->blk_imp_col); \ + } \ + } + +#define MAY_PUSH_SPC_SEQ_IND() \ + { \ + if (cur_ind_typ == IND_MAP) { \ + push_ind(scanner, IND_SEQ, bgn_col); \ + } \ + } + +#define MAY_UPD_IMP_COL() \ + { \ + if (scanner->blk_imp_row != bgn_row) { \ + scanner->blk_imp_row = bgn_row; \ + scanner->blk_imp_col = bgn_col; \ + scanner->blk_imp_tab = has_tab_ind; \ + } \ + } + +#define SGL_PLN_SYM(POS, CTX) \ + (scanner->rlt_sch == RS_NUL ? POS##_SGL_PLN_NUL_##CTX \ + : scanner->rlt_sch == RS_BOL ? POS##_SGL_PLN_BOL_##CTX \ + : scanner->rlt_sch == RS_INT ? POS##_SGL_PLN_INT_##CTX \ + : scanner->rlt_sch == RS_FLT ? POS##_SGL_PLN_FLT_##CTX \ + : POS##_SGL_PLN_STR_##CTX) + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } + +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +typedef struct { + int16_t row; + int16_t col; + int16_t blk_imp_row; + int16_t blk_imp_col; + int16_t blk_imp_tab; + Array(int16_t) ind_typ_stk; + Array(int16_t) ind_len_stk; + + // temp + int16_t end_row; + int16_t end_col; + int16_t cur_row; + int16_t cur_col; + int32_t cur_chr; + int8_t sch_stt; + ResultSchema rlt_sch; +} Scanner; + +unsigned serialize(Scanner *scanner, char *buffer) { + size_t i = 0; + buffer[i++] = (char)scanner->row; + buffer[i++] = (char)scanner->col; + buffer[i++] = (char)scanner->blk_imp_row; + buffer[i++] = (char)scanner->blk_imp_col; + buffer[i++] = (char)scanner->blk_imp_tab; + int16_t *typ_itr = scanner->ind_typ_stk.contents + 1; + int16_t *typ_end = scanner->ind_typ_stk.contents + scanner->ind_typ_stk.size; + int16_t *len_itr = scanner->ind_len_stk.contents + 1; + for (; typ_itr != typ_end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++typ_itr, ++len_itr) { + buffer[i++] = (char)*typ_itr; + buffer[i++] = (char)*len_itr; + } + return i; +} + +void deserialize(Scanner *scanner, const char *buffer, unsigned length) { + scanner->row = 0; + scanner->col = 0; + scanner->blk_imp_row = -1; + scanner->blk_imp_col = -1; + scanner->blk_imp_tab = 0; + array_delete(&scanner->ind_typ_stk); + array_push(&scanner->ind_typ_stk, IND_ROT); + array_delete(&scanner->ind_len_stk); + array_push(&scanner->ind_len_stk, -1); + if (length > 0) { + size_t i = 0; + scanner->row = (int16_t)buffer[i++]; + scanner->col = (int16_t)buffer[i++]; + scanner->blk_imp_row = (int16_t)buffer[i++]; + scanner->blk_imp_col = (int16_t)buffer[i++]; + scanner->blk_imp_tab = (int16_t)buffer[i++]; + while (i < length) { + array_push(&scanner->ind_typ_stk, (int16_t)buffer[i++]); + array_push(&scanner->ind_len_stk, (int16_t)buffer[i++]); + } + } +} + +void adv(Scanner *scanner, TSLexer *lexer) { + scanner->cur_col++; + scanner->cur_chr = lexer->lookahead; + advance(lexer); +} + +void adv_nwl(Scanner *scanner, TSLexer *lexer) { + scanner->cur_row++; + scanner->cur_col = 0; + scanner->cur_chr = lexer->lookahead; + advance(lexer); +} + +void skp(Scanner *scanner, TSLexer *lexer) { + scanner->cur_col++; + scanner->cur_chr = lexer->lookahead; + skip(lexer); +} + +void skp_nwl(Scanner *scanner, TSLexer *lexer) { + scanner->cur_row++; + scanner->cur_col = 0; + scanner->cur_chr = lexer->lookahead; + skip(lexer); +} + +void mrk_end(Scanner *scanner, TSLexer *lexer) { + scanner->end_row = scanner->cur_row; + scanner->end_col = scanner->cur_col; + lexer->mark_end(lexer); +} + +void init(Scanner *scanner) { + scanner->cur_row = scanner->row; + scanner->cur_col = scanner->col; + scanner->cur_chr = 0; + scanner->sch_stt = 0; + scanner->rlt_sch = RS_STR; +} + +void flush(Scanner *scanner) { + scanner->row = scanner->end_row; + scanner->col = scanner->end_col; +} + +void pop_ind(Scanner *scanner) { + array_pop(&scanner->ind_len_stk); + array_pop(&scanner->ind_typ_stk); +} + +void push_ind(Scanner *scanner, int16_t typ, int16_t len) { + array_push(&scanner->ind_len_stk, len); + array_push(&scanner->ind_typ_stk, typ); +} + +bool is_wsp(int32_t c) { return c == ' ' || c == '\t'; } + +bool is_nwl(int32_t c) { return c == '\r' || c == '\n'; } + +bool is_wht(int32_t c) { return is_wsp(c) || is_nwl(c) || c == 0; } + +bool is_ns_dec_digit(int32_t c) { return c >= '0' && c <= '9'; } + +bool is_ns_hex_digit(int32_t c) { return is_ns_dec_digit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } + +bool is_ns_word_char(int32_t c) { + return c == '-' || (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +bool is_nb_json(int32_t c) { return c == 0x09 || (c >= 0x20 && c <= 0x10ffff); } + +bool is_nb_double_char(int32_t c) { return is_nb_json(c) && c != '\\' && c != '"'; } + +bool is_nb_single_char(int32_t c) { return is_nb_json(c) && c != '\''; } + +bool is_ns_char(int32_t c) { + return (c >= 0x21 && c <= 0x7e) || c == 0x85 || (c >= 0xa0 && c <= 0xd7ff) || (c >= 0xe000 && c <= 0xfefe) || + (c >= 0xff00 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff); +} + +bool is_c_indicator(int32_t c) { + return c == '-' || c == '?' || c == ':' || c == ',' || c == '[' || c == ']' || c == '{' || c == '}' || c == '#' || + c == '&' || c == '*' || c == '!' || c == '|' || c == '>' || c == '\'' || c == '"' || c == '%' || c == '@' || + c == '`'; +} + +bool is_c_flow_indicator(int32_t c) { return c == ',' || c == '[' || c == ']' || c == '{' || c == '}'; } + +bool is_plain_safe_in_block(int32_t c) { return is_ns_char(c); } + +bool is_plain_safe_in_flow(int32_t c) { return is_ns_char(c) && !is_c_flow_indicator(c); } + +bool is_ns_uri_char(int32_t c) { + return is_ns_word_char(c) || c == '#' || c == ';' || c == '/' || c == '?' || c == ':' || c == '@' || c == '&' || + c == '=' || c == '+' || c == '$' || c == ',' || c == '_' || c == '.' || c == '!' || c == '~' || c == '*' || + c == '\'' || c == '(' || c == ')' || c == '[' || c == ']'; +} + +bool is_ns_tag_char(int32_t c) { + return is_ns_word_char(c) || c == '#' || c == ';' || c == '/' || c == '?' || c == ':' || c == '@' || c == '&' || + c == '=' || c == '+' || c == '$' || c == '_' || c == '.' || c == '~' || c == '*' || c == '\'' || c == '(' || + c == ')'; +} + +bool is_ns_anchor_char(int32_t c) { return is_ns_char(c) && !is_c_flow_indicator(c); } + +char scn_uri_esc(Scanner *scanner, TSLexer *lexer) { + if (lexer->lookahead != '%') { + return SCN_STOP; + } + mrk_end(scanner, lexer); + adv(scanner, lexer); + if (!is_ns_hex_digit(lexer->lookahead)) { + return SCN_FAIL; + } + adv(scanner, lexer); + if (!is_ns_hex_digit(lexer->lookahead)) { + return SCN_FAIL; + } + adv(scanner, lexer); + return SCN_SUCC; +} + +char scn_ns_uri_char(Scanner *scanner, TSLexer *lexer) { + if (is_ns_uri_char(lexer->lookahead)) { + adv(scanner, lexer); + return SCN_SUCC; + } + return scn_uri_esc(scanner, lexer); +} + +char scn_ns_tag_char(Scanner *scanner, TSLexer *lexer) { + if (is_ns_tag_char(lexer->lookahead)) { + adv(scanner, lexer); + return SCN_SUCC; + } + return scn_uri_esc(scanner, lexer); +} + +bool scn_dir_bgn(Scanner *scanner, TSLexer *lexer) { + adv(scanner, lexer); + if (lexer->lookahead == 'Y') { + adv(scanner, lexer); + if (lexer->lookahead == 'A') { + adv(scanner, lexer); + if (lexer->lookahead == 'M') { + adv(scanner, lexer); + if (lexer->lookahead == 'L') { + adv(scanner, lexer); + if (is_wht(lexer->lookahead)) { + mrk_end(scanner, lexer); + RET_SYM(S_DIR_YML_BGN); + } + } + } + } + } else if (lexer->lookahead == 'T') { + adv(scanner, lexer); + if (lexer->lookahead == 'A') { + adv(scanner, lexer); + if (lexer->lookahead == 'G') { + adv(scanner, lexer); + if (is_wht(lexer->lookahead)) { + mrk_end(scanner, lexer); + RET_SYM(S_DIR_TAG_BGN); + } + } + } + } + for (;;) { + if (!is_ns_char(lexer->lookahead)) { + break; + } + adv(scanner, lexer); + } + if (scanner->cur_col > 1 && is_wht(lexer->lookahead)) { + mrk_end(scanner, lexer); + RET_SYM(S_DIR_RSV_BGN); + } + return false; +} + +bool scn_dir_yml_ver(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + uint16_t n1 = 0; + uint16_t n2 = 0; + while (is_ns_dec_digit(lexer->lookahead)) { + adv(scanner, lexer); + n1++; + } + if (lexer->lookahead != '.') { + return false; + } + adv(scanner, lexer); + while (is_ns_dec_digit(lexer->lookahead)) { + adv(scanner, lexer); + n2++; + } + if (n1 == 0 || n2 == 0) { + return false; + } + mrk_end(scanner, lexer); + RET_SYM(result_symbol); +} + +bool scn_tag_hdl_tal(Scanner *scanner, TSLexer *lexer) { + if (lexer->lookahead == '!') { + adv(scanner, lexer); + return true; + } + uint16_t n = 0; + while (is_ns_word_char(lexer->lookahead)) { + adv(scanner, lexer); + n++; + } + if (n == 0) { + return true; + } + if (lexer->lookahead == '!') { + adv(scanner, lexer); + return true; + } + return false; +} + +bool scn_dir_tag_hdl(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (lexer->lookahead == '!') { + adv(scanner, lexer); + if (scn_tag_hdl_tal(scanner, lexer)) { + mrk_end(scanner, lexer); + RET_SYM(result_symbol); + } + } + return false; +} + +bool scn_dir_tag_pfx(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (lexer->lookahead == '!') { + adv(scanner, lexer); + } else if (scn_ns_tag_char(scanner, lexer) == SCN_SUCC) { + ; + } else { + return false; + } + for (;;) { + switch (scn_ns_uri_char(scanner, lexer)) { + case SCN_STOP: + mrk_end(scanner, lexer); + case SCN_FAIL: + RET_SYM(result_symbol); + default: + break; + } + } +} + +bool scn_dir_rsv_prm(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (!is_ns_char(lexer->lookahead)) { + return false; + } + adv(scanner, lexer); + while (is_ns_char(lexer->lookahead)) { + adv(scanner, lexer); + } + mrk_end(scanner, lexer); + RET_SYM(result_symbol); +} + +bool scn_tag(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (lexer->lookahead != '!') { + return false; + } + adv(scanner, lexer); + if (is_wht(lexer->lookahead)) { + mrk_end(scanner, lexer); + RET_SYM(result_symbol); + } + if (lexer->lookahead == '<') { + adv(scanner, lexer); + if (scn_ns_uri_char(scanner, lexer) != SCN_SUCC) { + return false; + } + for (;;) { + switch (scn_ns_uri_char(scanner, lexer)) { + case SCN_STOP: + if (lexer->lookahead == '>') { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(result_symbol); + } + case SCN_FAIL: + return false; + default: + break; + } + } + } else { + if (scn_tag_hdl_tal(scanner, lexer) && scn_ns_tag_char(scanner, lexer) != SCN_SUCC) { + return false; + } + for (;;) { + switch (scn_ns_tag_char(scanner, lexer)) { + case SCN_STOP: + mrk_end(scanner, lexer); + case SCN_FAIL: + RET_SYM(result_symbol); + default: + break; + } + } + } + return false; +} + +bool scn_acr_bgn(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (lexer->lookahead != '&') { + return false; + } + adv(scanner, lexer); + if (!is_ns_anchor_char(lexer->lookahead)) { + return false; + } + mrk_end(scanner, lexer); + RET_SYM(result_symbol); +} + +bool scn_acr_ctn(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + while (is_ns_anchor_char(lexer->lookahead)) { + adv(scanner, lexer); + } + mrk_end(scanner, lexer); + RET_SYM(result_symbol); +} + +bool scn_als_bgn(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (lexer->lookahead != '*') { + return false; + } + adv(scanner, lexer); + if (!is_ns_anchor_char(lexer->lookahead)) { + return false; + } + mrk_end(scanner, lexer); + RET_SYM(result_symbol); +} + +bool scn_als_ctn(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + while (is_ns_anchor_char(lexer->lookahead)) { + adv(scanner, lexer); + } + mrk_end(scanner, lexer); + RET_SYM(result_symbol); +} + +bool scn_dqt_esc_seq(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + uint16_t i; + switch (lexer->lookahead) { + case '0': + case 'a': + case 'b': + case 't': + case '\t': + case 'n': + case 'v': + case 'r': + case 'e': + case ' ': + case '"': + case '/': + case '\\': + case 'N': + case '_': + case 'L': + case 'P': + adv(scanner, lexer); + break; + case 'U': + adv(scanner, lexer); + for (i = 0; i < 8; i++) { + if (is_ns_hex_digit(lexer->lookahead)) { + adv(scanner, lexer); + } else { + return false; + } + } + break; + case 'u': + adv(scanner, lexer); + for (i = 0; i < 4; i++) { + if (is_ns_hex_digit(lexer->lookahead)) { + adv(scanner, lexer); + } else { + return false; + } + } + break; + case 'x': + adv(scanner, lexer); + for (i = 0; i < 2; i++) { + if (is_ns_hex_digit(lexer->lookahead)) { + adv(scanner, lexer); + } else { + return false; + } + } + break; + default: + return false; + } + mrk_end(scanner, lexer); + RET_SYM(result_symbol); +} + +bool scn_drs_doc_end(Scanner *scanner, TSLexer *lexer) { + if (lexer->lookahead != '-' && lexer->lookahead != '.') { + return false; + } + int32_t delimeter = lexer->lookahead; + adv(scanner, lexer); + if (lexer->lookahead == delimeter) { + adv(scanner, lexer); + if (lexer->lookahead == delimeter) { + adv(scanner, lexer); + if (is_wht(lexer->lookahead)) { + return true; + } + } + } + mrk_end(scanner, lexer); + return false; +} + +bool scn_dqt_str_cnt(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (!is_nb_double_char(lexer->lookahead)) { + return false; + } + if (scanner->cur_col == 0 && scn_drs_doc_end(scanner, lexer)) { + mrk_end(scanner, lexer); + RET_SYM(scanner->cur_chr == '-' ? S_DRS_END : S_DOC_END); + } else { + adv(scanner, lexer); + } + while (is_nb_double_char(lexer->lookahead)) { + adv(scanner, lexer); + } + mrk_end(scanner, lexer); + RET_SYM(result_symbol); +} + +bool scn_sqt_str_cnt(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (!is_nb_single_char(lexer->lookahead)) { + return false; + } + if (scanner->cur_col == 0 && scn_drs_doc_end(scanner, lexer)) { + mrk_end(scanner, lexer); + RET_SYM(scanner->cur_chr == '-' ? S_DRS_END : S_DOC_END); + } else { + adv(scanner, lexer); + } + while (is_nb_single_char(lexer->lookahead)) { + adv(scanner, lexer); + } + mrk_end(scanner, lexer); + RET_SYM(result_symbol); +} + +bool scn_blk_str_bgn(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (lexer->lookahead != '|' && lexer->lookahead != '>') { + return false; + } + adv(scanner, lexer); + int16_t cur_ind = *array_back(&scanner->ind_len_stk); + int16_t ind = -1; + if (lexer->lookahead >= '1' && lexer->lookahead <= '9') { + ind = lexer->lookahead - '1'; + adv(scanner, lexer); + if (lexer->lookahead == '+' || lexer->lookahead == '-') { + adv(scanner, lexer); + } + } else if (lexer->lookahead == '+' || lexer->lookahead == '-') { + adv(scanner, lexer); + if (lexer->lookahead >= '1' && lexer->lookahead <= '9') { + ind = lexer->lookahead - '1'; + adv(scanner, lexer); + } + } + if (!is_wht(lexer->lookahead)) { + return false; + } + mrk_end(scanner, lexer); + if (ind != -1) { + ind += cur_ind; + } else { + ind = cur_ind; + while (is_wsp(lexer->lookahead)) { + adv(scanner, lexer); + } + if (lexer->lookahead == '#') { + adv(scanner, lexer); + while (!is_nwl(lexer->lookahead) && lexer->lookahead != 0) { + adv(scanner, lexer); + } + } + if (is_nwl(lexer->lookahead)) { + adv_nwl(scanner, lexer); + } + while (lexer->lookahead != 0) { + if (lexer->lookahead == ' ') { + adv(scanner, lexer); + } else if (is_nwl(lexer->lookahead)) { + if (scanner->cur_col - 1 < ind) { + break; + } + ind = scanner->cur_col - 1; + adv_nwl(scanner, lexer); + } else { + if (scanner->cur_col - 1 > ind) { + ind = scanner->cur_col - 1; + } + break; + } + } + } + PUSH_IND(IND_STR, ind); + RET_SYM(result_symbol); +} + +bool scn_blk_str_cnt(Scanner *scanner, TSLexer *lexer, TSSymbol result_symbol) { + if (!is_ns_char(lexer->lookahead)) { + return false; + } + if (scanner->cur_col == 0 && scn_drs_doc_end(scanner, lexer)) { + POP_IND(); + RET_SYM(BL); + } else { + adv(scanner, lexer); + } + mrk_end(scanner, lexer); + for (;;) { + if (is_ns_char(lexer->lookahead)) { + adv(scanner, lexer); + while (is_ns_char(lexer->lookahead)) { + adv(scanner, lexer); + } + mrk_end(scanner, lexer); + } + if (is_wsp(lexer->lookahead)) { + adv(scanner, lexer); + while (is_wsp(lexer->lookahead)) { + adv(scanner, lexer); + } + } else { + break; + } + } + RET_SYM(result_symbol); +} + +char scn_pln_cnt(Scanner *scanner, TSLexer *lexer, bool (*is_plain_safe)(int32_t)) { + bool is_cur_wsp = is_wsp(scanner->cur_chr); + bool is_cur_saf = is_plain_safe(scanner->cur_chr); + bool is_lka_wsp = is_wsp(lexer->lookahead); + bool is_lka_saf = is_plain_safe(lexer->lookahead); + if (is_lka_saf || is_lka_wsp) { + for (;;) { + if (is_lka_saf && lexer->lookahead != '#' && lexer->lookahead != ':') { + adv(scanner, lexer); + mrk_end(scanner, lexer); + scanner->sch_stt = adv_sch_stt(scanner->sch_stt, scanner->cur_chr, &scanner->rlt_sch); + } else if (is_cur_saf && lexer->lookahead == '#') { + adv(scanner, lexer); + mrk_end(scanner, lexer); + scanner->sch_stt = adv_sch_stt(scanner->sch_stt, scanner->cur_chr, &scanner->rlt_sch); + } else if (is_lka_wsp) { + adv(scanner, lexer); + scanner->sch_stt = adv_sch_stt(scanner->sch_stt, scanner->cur_chr, &scanner->rlt_sch); + } else if (lexer->lookahead == ':') { + adv(scanner, lexer); // check later + } else { + break; + } + + is_cur_wsp = is_lka_wsp; + is_cur_saf = is_lka_saf; + is_lka_wsp = is_wsp(lexer->lookahead); + is_lka_saf = is_plain_safe(lexer->lookahead); + + if (scanner->cur_chr == ':') { + if (is_lka_saf) { + mrk_end(scanner, lexer); + scanner->sch_stt = adv_sch_stt(scanner->sch_stt, scanner->cur_chr, &scanner->rlt_sch); + } else { + return SCN_FAIL; + } + } + } + } else { + return SCN_STOP; + } + return SCN_SUCC; +} + +bool scan(Scanner *scanner, TSLexer *lexer, const bool *valid_symbols) { + init(scanner); + mrk_end(scanner, lexer); + + bool allow_comment = !(valid_symbols[R_DQT_STR_CTN] || valid_symbols[BR_DQT_STR_CTN] || + valid_symbols[R_SQT_STR_CTN] || valid_symbols[BR_SQT_STR_CTN]); + int16_t *ind_ptr = scanner->ind_len_stk.contents + scanner->ind_len_stk.size - 1; + int16_t *ind_end = scanner->ind_len_stk.contents; + int16_t cur_ind = *ind_ptr--; + int16_t x = *ind_ptr; + int16_t prt_ind = ind_ptr == ind_end ? -1 : *ind_ptr; + int16_t cur_ind_typ = *array_back(&scanner->ind_typ_stk); + + bool has_tab_ind = false; + int16_t leading_spaces = 0; + + for (;;) { + if (lexer->lookahead == ' ') { + if (!has_tab_ind) { + leading_spaces++; + } + skp(scanner, lexer); + } else if (lexer->lookahead == '\t') { + has_tab_ind = true; + skp(scanner, lexer); + } else if (is_nwl(lexer->lookahead)) { + has_tab_ind = false; + leading_spaces = 0; + skp_nwl(scanner, lexer); + } else if (allow_comment && lexer->lookahead == '#') { + if (valid_symbols[BR_BLK_STR_CTN] && valid_symbols[BL] && scanner->cur_col <= cur_ind) { + POP_IND(); + RET_SYM(BL); + } + if (valid_symbols[BR_BLK_STR_CTN] + ? scanner->cur_row == scanner->row + : scanner->cur_col == 0 || scanner->cur_row != scanner->row || scanner->cur_col > scanner->col) { + adv(scanner, lexer); + while (!is_nwl(lexer->lookahead) && lexer->lookahead != 0) { + adv(scanner, lexer); + } + mrk_end(scanner, lexer); + RET_SYM(COMMENT); + } else { + break; + } + } else { + break; + } + } + + if (lexer->lookahead == 0) { + if (valid_symbols[BL]) { + mrk_end(scanner, lexer); + POP_IND(); + RET_SYM(BL) + } + if (valid_symbols[END_OF_FILE]) { + mrk_end(scanner, lexer); + RET_SYM(END_OF_FILE) + } + return false; + } + + int16_t bgn_row = scanner->cur_row; + int16_t bgn_col = scanner->cur_col; + int32_t bgn_chr = lexer->lookahead; + + if (valid_symbols[BL] && bgn_col <= cur_ind && !has_tab_ind) { + if (cur_ind == prt_ind && cur_ind_typ == IND_SEQ ? bgn_col < cur_ind || lexer->lookahead != '-' + : bgn_col <= prt_ind || cur_ind_typ == IND_STR) { + POP_IND(); + RET_SYM(BL); + } + } + + bool has_nwl = scanner->cur_row > scanner->row; + bool is_r = !has_nwl; + bool is_br = has_nwl && leading_spaces > cur_ind; + bool is_b = has_nwl && leading_spaces == cur_ind && !has_tab_ind; + bool is_s = bgn_col == 0; + + if (valid_symbols[R_DIR_YML_VER] && is_r) { + return scn_dir_yml_ver(scanner, lexer, R_DIR_YML_VER); + } + if (valid_symbols[R_DIR_TAG_HDL] && is_r) { + return scn_dir_tag_hdl(scanner, lexer, R_DIR_TAG_HDL); + } + if (valid_symbols[R_DIR_TAG_PFX] && is_r) { + return scn_dir_tag_pfx(scanner, lexer, R_DIR_TAG_PFX); + } + if (valid_symbols[R_DIR_RSV_PRM] && is_r) { + return scn_dir_rsv_prm(scanner, lexer, R_DIR_RSV_PRM); + } + if (valid_symbols[BR_BLK_STR_CTN] && is_br && scn_blk_str_cnt(scanner, lexer, BR_BLK_STR_CTN)) { + return true; + } + + if ((valid_symbols[R_DQT_STR_CTN] && is_r && scn_dqt_str_cnt(scanner, lexer, R_DQT_STR_CTN)) || + (valid_symbols[BR_DQT_STR_CTN] && is_br && scn_dqt_str_cnt(scanner, lexer, BR_DQT_STR_CTN))) { + return true; + } + + if ((valid_symbols[R_SQT_STR_CTN] && is_r && scn_sqt_str_cnt(scanner, lexer, R_SQT_STR_CTN)) || + (valid_symbols[BR_SQT_STR_CTN] && is_br && scn_sqt_str_cnt(scanner, lexer, BR_SQT_STR_CTN))) { + return true; + } + + if (valid_symbols[R_ACR_CTN] && is_r) { + return scn_acr_ctn(scanner, lexer, R_ACR_CTN); + } + if (valid_symbols[R_ALS_CTN] && is_r) { + return scn_als_ctn(scanner, lexer, R_ALS_CTN); + } + + if (lexer->lookahead == '%') { + if (valid_symbols[S_DIR_YML_BGN] && is_s) { + return scn_dir_bgn(scanner, lexer); + } + } else if (lexer->lookahead == '*') { + if (valid_symbols[R_ALS_BGN] && is_r) { + MAY_UPD_IMP_COL(); + return scn_als_bgn(scanner, lexer, R_ALS_BGN); + } + if (valid_symbols[BR_ALS_BGN] && is_br) { + MAY_UPD_IMP_COL(); + return scn_als_bgn(scanner, lexer, BR_ALS_BGN); + } + if (valid_symbols[B_ALS_BGN] && is_b) { + MAY_UPD_IMP_COL(); + return scn_als_bgn(scanner, lexer, B_ALS_BGN); + } + } else if (lexer->lookahead == '&') { + if (valid_symbols[R_ACR_BGN] && is_r) { + MAY_UPD_IMP_COL(); + return scn_acr_bgn(scanner, lexer, R_ACR_BGN); + } + if (valid_symbols[BR_ACR_BGN] && is_br) { + MAY_UPD_IMP_COL(); + return scn_acr_bgn(scanner, lexer, BR_ACR_BGN); + } + if (valid_symbols[B_ACR_BGN] && is_b) { + MAY_UPD_IMP_COL(); + return scn_acr_bgn(scanner, lexer, B_ACR_BGN); + } + } else if (lexer->lookahead == '!') { + if (valid_symbols[R_TAG] && is_r) { + MAY_UPD_IMP_COL(); + return scn_tag(scanner, lexer, R_TAG); + } + if (valid_symbols[BR_TAG] && is_br) { + MAY_UPD_IMP_COL(); + return scn_tag(scanner, lexer, BR_TAG); + } + if (valid_symbols[B_TAG] && is_b) { + MAY_UPD_IMP_COL(); + return scn_tag(scanner, lexer, B_TAG); + } + } else if (lexer->lookahead == '[') { + if (valid_symbols[R_FLW_SEQ_BGN] && is_r) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_FLW_SEQ_BGN) + } + if (valid_symbols[BR_FLW_SEQ_BGN] && is_br) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_FLW_SEQ_BGN) + } + if (valid_symbols[B_FLW_SEQ_BGN] && is_b) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(B_FLW_SEQ_BGN) + } + } else if (lexer->lookahead == ']') { + if (valid_symbols[R_FLW_SEQ_END] && is_r) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_FLW_SEQ_END) + } + if (valid_symbols[BR_FLW_SEQ_END] && is_br) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_FLW_SEQ_END) + } + } else if (lexer->lookahead == '{') { + if (valid_symbols[R_FLW_MAP_BGN] && is_r) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_FLW_MAP_BGN) + } + if (valid_symbols[BR_FLW_MAP_BGN] && is_br) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_FLW_MAP_BGN) + } + if (valid_symbols[B_FLW_MAP_BGN] && is_b) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(B_FLW_MAP_BGN) + } + } else if (lexer->lookahead == '}') { + if (valid_symbols[R_FLW_MAP_END] && is_r) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_FLW_MAP_END) + } + if (valid_symbols[BR_FLW_MAP_END] && is_br) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_FLW_MAP_END) + } + } else if (lexer->lookahead == ',') { + if (valid_symbols[R_FLW_SEP_BGN] && is_r) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_FLW_SEP_BGN) + } + if (valid_symbols[BR_FLW_SEP_BGN] && is_br) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_FLW_SEP_BGN) + } + } else if (lexer->lookahead == '"') { + if (valid_symbols[R_DQT_STR_BGN] && is_r) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_DQT_STR_BGN) + } + if (valid_symbols[BR_DQT_STR_BGN] && is_br) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_DQT_STR_BGN) + } + if (valid_symbols[B_DQT_STR_BGN] && is_b) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(B_DQT_STR_BGN) + } + if (valid_symbols[R_DQT_STR_END] && is_r) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_DQT_STR_END) + } + if (valid_symbols[BR_DQT_STR_END] && is_br) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_DQT_STR_END) + } + } else if (lexer->lookahead == '\'') { + if (valid_symbols[R_SQT_STR_BGN] && is_r) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_SQT_STR_BGN) + } + if (valid_symbols[BR_SQT_STR_BGN] && is_br) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_SQT_STR_BGN) + } + if (valid_symbols[B_SQT_STR_BGN] && is_b) { + MAY_UPD_IMP_COL(); + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(B_SQT_STR_BGN) + } + if (valid_symbols[R_SQT_STR_END] && is_r) { + adv(scanner, lexer); + if (lexer->lookahead == '\'') { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_SQT_ESC_SQT) + } else { + mrk_end(scanner, lexer); + RET_SYM(R_SQT_STR_END) + } + } + if (valid_symbols[BR_SQT_STR_END] && is_br) { + adv(scanner, lexer); + if (lexer->lookahead == '\'') { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_SQT_ESC_SQT) + } else { + mrk_end(scanner, lexer); + RET_SYM(BR_SQT_STR_END) + } + } + } else if (lexer->lookahead == '?') { + bool is_r_blk_key_bgn = valid_symbols[R_BLK_KEY_BGN] && is_r; + bool is_br_blk_key_bgn = valid_symbols[BR_BLK_KEY_BGN] && is_br; + bool is_b_blk_key_bgn = valid_symbols[B_BLK_KEY_BGN] && is_b; + bool is_r_flw_key_bgn = valid_symbols[R_FLW_KEY_BGN] && is_r; + bool is_br_flw_key_bgn = valid_symbols[BR_FLW_KEY_BGN] && is_br; + if (is_r_blk_key_bgn || is_br_blk_key_bgn || is_b_blk_key_bgn || is_r_flw_key_bgn || is_br_flw_key_bgn) { + adv(scanner, lexer); + if (is_wht(lexer->lookahead)) { + mrk_end(scanner, lexer); + if (is_r_blk_key_bgn) { + PUSH_BGN_IND(IND_MAP); + RET_SYM(R_BLK_KEY_BGN); + } + if (is_br_blk_key_bgn) { + PUSH_BGN_IND(IND_MAP); + RET_SYM(BR_BLK_KEY_BGN); + } + if (is_b_blk_key_bgn) + RET_SYM(B_BLK_KEY_BGN); + if (is_r_flw_key_bgn) + RET_SYM(R_FLW_KEY_BGN); + if (is_br_flw_key_bgn) + RET_SYM(BR_FLW_KEY_BGN); + } + } + } else if (lexer->lookahead == ':') { + if (valid_symbols[R_FLW_JSV_BGN] && is_r) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(R_FLW_JSV_BGN); + } + if (valid_symbols[BR_FLW_JSV_BGN] && is_br) { + adv(scanner, lexer); + mrk_end(scanner, lexer); + RET_SYM(BR_FLW_JSV_BGN); + } + bool is_r_blk_val_bgn = valid_symbols[R_BLK_VAL_BGN] && is_r; + bool is_br_blk_val_bgn = valid_symbols[BR_BLK_VAL_BGN] && is_br; + bool is_b_blk_val_bgn = valid_symbols[B_BLK_VAL_BGN] && is_b; + bool is_r_blk_imp_bgn = valid_symbols[R_BLK_IMP_BGN] && is_r; + bool is_r_flw_njv_bgn = valid_symbols[R_FLW_NJV_BGN] && is_r; + bool is_br_flw_njv_bgn = valid_symbols[BR_FLW_NJV_BGN] && is_br; + if (is_r_blk_val_bgn || is_br_blk_val_bgn || is_b_blk_val_bgn || is_r_blk_imp_bgn || is_r_flw_njv_bgn || + is_br_flw_njv_bgn) { + adv(scanner, lexer); + bool is_lka_wht = is_wht(lexer->lookahead); + if (is_lka_wht) { + if (is_r_blk_val_bgn) { + PUSH_BGN_IND(IND_MAP); + mrk_end(scanner, lexer); + RET_SYM(R_BLK_VAL_BGN); + } + if (is_br_blk_val_bgn) { + PUSH_BGN_IND(IND_MAP); + mrk_end(scanner, lexer); + RET_SYM(BR_BLK_VAL_BGN); + } + if (is_b_blk_val_bgn) { + mrk_end(scanner, lexer); + RET_SYM(B_BLK_VAL_BGN); + } + if (is_r_blk_imp_bgn) { + MAY_PUSH_IMP_IND(); + mrk_end(scanner, lexer); + RET_SYM(R_BLK_IMP_BGN); + } + } + if (is_lka_wht || lexer->lookahead == ',' || lexer->lookahead == ']' || lexer->lookahead == '}') { + if (is_r_flw_njv_bgn) { + mrk_end(scanner, lexer); + RET_SYM(R_FLW_NJV_BGN); + } + if (is_br_flw_njv_bgn) { + mrk_end(scanner, lexer); + RET_SYM(BR_FLW_NJV_BGN); + } + } + } + } else if (lexer->lookahead == '-') { + bool is_r_blk_seq_bgn = valid_symbols[R_BLK_SEQ_BGN] && is_r; + bool is_br_blk_seq_bgn = valid_symbols[BR_BLK_SEQ_BGN] && is_br; + bool is_b_blk_seq_bgn = valid_symbols[B_BLK_SEQ_BGN] && is_b; + bool is_s_drs_end = is_s; + if (is_r_blk_seq_bgn || is_br_blk_seq_bgn || is_b_blk_seq_bgn || is_s_drs_end) { + adv(scanner, lexer); + if (is_wht(lexer->lookahead)) { + if (is_r_blk_seq_bgn) { + PUSH_BGN_IND(IND_SEQ); + mrk_end(scanner, lexer); + RET_SYM(R_BLK_SEQ_BGN) + } + if (is_br_blk_seq_bgn) { + PUSH_BGN_IND(IND_SEQ); + mrk_end(scanner, lexer); + RET_SYM(BR_BLK_SEQ_BGN) + } + if (is_b_blk_seq_bgn) { + MAY_PUSH_SPC_SEQ_IND(); + mrk_end(scanner, lexer); + RET_SYM(B_BLK_SEQ_BGN) + } + } else if (lexer->lookahead == '-' && is_s_drs_end) { + adv(scanner, lexer); + if (lexer->lookahead == '-') { + adv(scanner, lexer); + if (is_wht(lexer->lookahead)) { + if (valid_symbols[BL]) { + POP_IND(); + RET_SYM(BL); + } + mrk_end(scanner, lexer); + RET_SYM(S_DRS_END); + } + } + } + } + } else if (lexer->lookahead == '.') { + if (is_s) { + adv(scanner, lexer); + if (lexer->lookahead == '.') { + adv(scanner, lexer); + if (lexer->lookahead == '.') { + adv(scanner, lexer); + if (is_wht(lexer->lookahead)) { + if (valid_symbols[BL]) { + POP_IND(); + RET_SYM(BL); + } + mrk_end(scanner, lexer); + RET_SYM(S_DOC_END); + } + } + } + } + } else if (lexer->lookahead == '\\') { + bool is_r_dqt_esc_nwl = valid_symbols[R_DQT_ESC_NWL] && is_r; + bool is_br_dqt_esc_nwl = valid_symbols[BR_DQT_ESC_NWL] && is_br; + bool is_r_dqt_esc_seq = valid_symbols[R_DQT_ESC_SEQ] && is_r; + bool is_br_dqt_esc_seq = valid_symbols[BR_DQT_ESC_SEQ] && is_br; + if (is_r_dqt_esc_nwl || is_br_dqt_esc_nwl || is_r_dqt_esc_seq || is_br_dqt_esc_seq) { + adv(scanner, lexer); + if (is_nwl(lexer->lookahead)) { + if (is_r_dqt_esc_nwl) { + mrk_end(scanner, lexer); + RET_SYM(R_DQT_ESC_NWL) + } + if (is_br_dqt_esc_nwl) { + mrk_end(scanner, lexer); + RET_SYM(BR_DQT_ESC_NWL) + } + } + if (is_r_dqt_esc_seq) { + return scn_dqt_esc_seq(scanner, lexer, R_DQT_ESC_SEQ); + } + if (is_br_dqt_esc_seq) { + return scn_dqt_esc_seq(scanner, lexer, BR_DQT_ESC_SEQ); + } + return false; + } + } else if (lexer->lookahead == '|') { + if (valid_symbols[R_BLK_LIT_BGN] && is_r) { + return scn_blk_str_bgn(scanner, lexer, R_BLK_LIT_BGN); + } + if (valid_symbols[BR_BLK_LIT_BGN] && is_br) { + return scn_blk_str_bgn(scanner, lexer, BR_BLK_LIT_BGN); + } + } else if (lexer->lookahead == '>') { + if (valid_symbols[R_BLK_FLD_BGN] && is_r) { + return scn_blk_str_bgn(scanner, lexer, R_BLK_FLD_BGN); + } + if (valid_symbols[BR_BLK_FLD_BGN] && is_br) { + return scn_blk_str_bgn(scanner, lexer, BR_BLK_FLD_BGN); + } + } + + bool maybe_sgl_pln_blk = (valid_symbols[R_SGL_PLN_STR_BLK] && is_r) || + (valid_symbols[BR_SGL_PLN_STR_BLK] && is_br) || (valid_symbols[B_SGL_PLN_STR_BLK] && is_b); + bool maybe_sgl_pln_flw = (valid_symbols[R_SGL_PLN_STR_FLW] && is_r) || (valid_symbols[BR_SGL_PLN_STR_FLW] && is_br); + bool maybe_mtl_pln_blk = (valid_symbols[R_MTL_PLN_STR_BLK] && is_r) || (valid_symbols[BR_MTL_PLN_STR_BLK] && is_br); + bool maybe_mtl_pln_flw = (valid_symbols[R_MTL_PLN_STR_FLW] && is_r) || (valid_symbols[BR_MTL_PLN_STR_FLW] && is_br); + + if (maybe_sgl_pln_blk || maybe_sgl_pln_flw || maybe_mtl_pln_blk || maybe_mtl_pln_flw) { + bool is_in_blk = maybe_sgl_pln_blk || maybe_mtl_pln_blk; + bool (*is_plain_safe)(int32_t) = is_in_blk ? is_plain_safe_in_block : is_plain_safe_in_flow; + if (scanner->cur_col - bgn_col == 0) { + adv(scanner, lexer); + } + if (scanner->cur_col - bgn_col == 1) { + bool is_plain_first = + (is_ns_char(bgn_chr) && !is_c_indicator(bgn_chr)) || + ((bgn_chr == '-' || bgn_chr == '?' || bgn_chr == ':') && is_plain_safe(lexer->lookahead)); + if (!is_plain_first) { + return false; + } + scanner->sch_stt = adv_sch_stt(scanner->sch_stt, scanner->cur_chr, &scanner->rlt_sch); + } else { + // no need to check the following cases: + // ..X + // ...X + // --X + // ---X + // X: lookahead + scanner->sch_stt = SCH_STT_FRZ; // must be RS_STR + } + + mrk_end(scanner, lexer); + + for (;;) { + if (!is_nwl(lexer->lookahead)) { + if (scn_pln_cnt(scanner, lexer, is_plain_safe) != SCN_SUCC) { + break; + } + } + if (lexer->lookahead == 0 || !is_nwl(lexer->lookahead)) { + break; + } + for (;;) { + if (is_nwl(lexer->lookahead)) { + adv_nwl(scanner, lexer); + } else if (is_wsp(lexer->lookahead)) { + adv(scanner, lexer); + } else { + break; + } + } + if (lexer->lookahead == 0 || scanner->cur_col <= cur_ind) { + break; + } + if (scanner->cur_col == 0 && scn_drs_doc_end(scanner, lexer)) { + break; + } + } + + if (scanner->end_row == bgn_row) { + if (maybe_sgl_pln_blk) { + MAY_UPD_IMP_COL(); + RET_SYM(is_r ? SGL_PLN_SYM(R, BLK) : is_br ? SGL_PLN_SYM(BR, BLK) : SGL_PLN_SYM(B, BLK)); + } + if (maybe_sgl_pln_flw) + RET_SYM(is_r ? SGL_PLN_SYM(R, FLW) : SGL_PLN_SYM(BR, FLW)); + } else { + if (maybe_mtl_pln_blk) { + MAY_UPD_IMP_COL(); + RET_SYM(is_r ? R_MTL_PLN_STR_BLK : BR_MTL_PLN_STR_BLK); + } + if (maybe_mtl_pln_flw) + RET_SYM(is_r ? R_MTL_PLN_STR_FLW : BR_MTL_PLN_STR_FLW); + } + + return false; + } + + return false; +} + +void *tree_sitter_yaml_external_scanner_create() { + Scanner *scanner = calloc(1, sizeof(Scanner)); + deserialize(scanner, NULL, 0); + return scanner; +} + +void tree_sitter_yaml_external_scanner_destroy(void *payload) { + Scanner *scanner = (Scanner *)payload; + free(scanner); +} + +unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer) { + Scanner *scanner = (Scanner *)payload; + return serialize(scanner, buffer); +} + +void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { + Scanner *scanner = (Scanner *)payload; + deserialize(scanner, buffer, length); +} + +bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { + Scanner *scanner = (Scanner *)payload; + return scan(scanner, lexer, valid_symbols); +} diff --git a/src/scanner.cc b/src/scanner.cc deleted file mode 100644 index 33de032..0000000 --- a/src/scanner.cc +++ /dev/null @@ -1,981 +0,0 @@ -#include -#include - -// tree-sitter does not support multiple files for external scanner -#include "./schema.generated.cc" - -namespace { - -using std::vector; -using namespace tree_sitter_yaml; - -enum TokenType { - END_OF_FILE, - - S_DIR_YML_BGN, R_DIR_YML_VER, - S_DIR_TAG_BGN, R_DIR_TAG_HDL, R_DIR_TAG_PFX, - S_DIR_RSV_BGN, R_DIR_RSV_PRM, - S_DRS_END, - S_DOC_END, - R_BLK_SEQ_BGN, BR_BLK_SEQ_BGN, B_BLK_SEQ_BGN, - R_BLK_KEY_BGN, BR_BLK_KEY_BGN, B_BLK_KEY_BGN, - R_BLK_VAL_BGN, BR_BLK_VAL_BGN, B_BLK_VAL_BGN, - R_BLK_IMP_BGN, - R_BLK_LIT_BGN, BR_BLK_LIT_BGN, - R_BLK_FLD_BGN, BR_BLK_FLD_BGN, - BR_BLK_STR_CTN, - R_FLW_SEQ_BGN, BR_FLW_SEQ_BGN, B_FLW_SEQ_BGN, - R_FLW_SEQ_END, BR_FLW_SEQ_END, - R_FLW_MAP_BGN, BR_FLW_MAP_BGN, B_FLW_MAP_BGN, - R_FLW_MAP_END, BR_FLW_MAP_END, - R_FLW_SEP_BGN, BR_FLW_SEP_BGN, - R_FLW_KEY_BGN, BR_FLW_KEY_BGN, - R_FLW_JSV_BGN, BR_FLW_JSV_BGN, - R_FLW_NJV_BGN, BR_FLW_NJV_BGN, - R_DQT_STR_BGN, BR_DQT_STR_BGN, B_DQT_STR_BGN, - R_DQT_STR_CTN, BR_DQT_STR_CTN, - R_DQT_ESC_NWL, BR_DQT_ESC_NWL, - R_DQT_ESC_SEQ, BR_DQT_ESC_SEQ, - R_DQT_STR_END, BR_DQT_STR_END, - R_SQT_STR_BGN, BR_SQT_STR_BGN, B_SQT_STR_BGN, - R_SQT_STR_CTN, BR_SQT_STR_CTN, - R_SQT_ESC_SQT, BR_SQT_ESC_SQT, - R_SQT_STR_END, BR_SQT_STR_END, - - R_SGL_PLN_NUL_BLK, BR_SGL_PLN_NUL_BLK, B_SGL_PLN_NUL_BLK, R_SGL_PLN_NUL_FLW, BR_SGL_PLN_NUL_FLW, - R_SGL_PLN_BOL_BLK, BR_SGL_PLN_BOL_BLK, B_SGL_PLN_BOL_BLK, R_SGL_PLN_BOL_FLW, BR_SGL_PLN_BOL_FLW, - R_SGL_PLN_INT_BLK, BR_SGL_PLN_INT_BLK, B_SGL_PLN_INT_BLK, R_SGL_PLN_INT_FLW, BR_SGL_PLN_INT_FLW, - R_SGL_PLN_FLT_BLK, BR_SGL_PLN_FLT_BLK, B_SGL_PLN_FLT_BLK, R_SGL_PLN_FLT_FLW, BR_SGL_PLN_FLT_FLW, - R_SGL_PLN_STR_BLK, BR_SGL_PLN_STR_BLK, B_SGL_PLN_STR_BLK, R_SGL_PLN_STR_FLW, BR_SGL_PLN_STR_FLW, - - R_MTL_PLN_STR_BLK, BR_MTL_PLN_STR_BLK, - R_MTL_PLN_STR_FLW, BR_MTL_PLN_STR_FLW, - - R_TAG, BR_TAG, B_TAG, - R_ACR_BGN, BR_ACR_BGN, B_ACR_BGN, R_ACR_CTN, - R_ALS_BGN, BR_ALS_BGN, B_ALS_BGN, R_ALS_CTN, - - BL, - COMMENT, -}; - -#define ADV() adv(lexer) -#define ADV_NWL() adv_nwl(lexer) -#define SKP() skp(lexer) -#define SKP_NWL() skp_nwl(lexer) -#define MRK_END() mrk_end(lexer) -#define LKA lexer->lookahead -#define VLD valid_symbols - -#define SCN_SUCC 1 -#define SCN_STOP 0 -#define SCN_FAIL -1 - -#define IND_ROT 'r' -#define IND_MAP 'm' -#define IND_SEQ 'q' -#define IND_STR 's' - -#define RET_SYM(RESULT_SYMBOL) { \ - flush(); \ - lexer->result_symbol = RESULT_SYMBOL; \ - return true; \ -} - -#define POP_IND() { \ - /* incorrect status caused by error recovering */ \ - if (ind_typ_stk.size() == 1) { \ - return false; \ - } \ - pop_ind(); \ -} -#define PUSH_IND(TYP, LEN) push_ind(TYP, LEN) -#define PUSH_BGN_IND(TYP) { \ - if (has_tab_ind) return false; \ - push_ind(TYP, bgn_col); \ -} -#define MAY_PUSH_IMP_IND(TYP) { \ - if (cur_ind != blk_imp_col) { \ - if (blk_imp_tab) return false; \ - push_ind(IND_MAP, blk_imp_col); \ - } \ -} -#define MAY_PUSH_SPC_SEQ_IND() { \ - if (cur_ind_typ == IND_MAP) { \ - push_ind(IND_SEQ, bgn_col); \ - } \ -} -#define MAY_UPD_IMP_COL() { \ - if (blk_imp_row != bgn_row) { \ - blk_imp_row = bgn_row; \ - blk_imp_col = bgn_col; \ - blk_imp_tab = has_tab_ind; \ - } \ -} -#define UPD_SCH_STT() { \ - sch_stt = adv_sch_stt(sch_stt, cur_chr, &rlt_sch); \ -} -#define SGL_PLN_SYM(POS, CTX) ( \ - rlt_sch == RS_NUL ? POS##_SGL_PLN_NUL_##CTX : \ - rlt_sch == RS_BOL ? POS##_SGL_PLN_BOL_##CTX : \ - rlt_sch == RS_INT ? POS##_SGL_PLN_INT_##CTX : \ - rlt_sch == RS_FLT ? POS##_SGL_PLN_FLT_##CTX : \ - POS##_SGL_PLN_STR_##CTX \ -) - -struct Scanner { - int16_t row; - int16_t col; - int16_t blk_imp_row; - int16_t blk_imp_col; - int16_t blk_imp_tab; - vector ind_typ_stk; - vector ind_len_stk; - - // temp - int16_t end_row; - int16_t end_col; - int16_t cur_row; - int16_t cur_col; - int32_t cur_chr; - int8_t sch_stt; - ResultSchema rlt_sch; - - Scanner() { - deserialize(NULL, 0); - } - - unsigned serialize(char *buffer) { - size_t i = 0; - buffer[i++] = row; - buffer[i++] = col; - buffer[i++] = blk_imp_row; - buffer[i++] = blk_imp_col; - buffer[i++] = blk_imp_tab; - vector::iterator - typ_itr = ind_typ_stk.begin() + 1, - typ_end = ind_typ_stk.end(), - len_itr = ind_len_stk.begin() + 1; - for (; typ_itr != typ_end && i < TREE_SITTER_SERIALIZATION_BUFFER_SIZE; ++typ_itr, ++len_itr) { - buffer[i++] = *typ_itr; - buffer[i++] = *len_itr; - } - return i; - } - - void deserialize(const char *buffer, unsigned length) { - row = 0; - col = 0; - blk_imp_row = -1; - blk_imp_col = -1; - blk_imp_tab = 0; - ind_typ_stk.clear(); - ind_typ_stk.push_back(IND_ROT); - ind_len_stk.clear(); - ind_len_stk.push_back(-1); - if (length > 0) { - size_t i = 0; - row = buffer[i++]; - col = buffer[i++]; - blk_imp_row = buffer[i++]; - blk_imp_col = buffer[i++]; - blk_imp_tab = buffer[i++]; - while (i < length) { - ind_typ_stk.push_back(buffer[i++]); - ind_len_stk.push_back(buffer[i++]); - } - } - } - - void adv(TSLexer *lexer) { - cur_col++; - cur_chr = lexer->lookahead; - lexer->advance(lexer, false); - } - - void adv_nwl(TSLexer *lexer) { - cur_row++; - cur_col = 0; - cur_chr = lexer->lookahead; - lexer->advance(lexer, false); - } - - void skp(TSLexer *lexer) { - cur_col++; - cur_chr = lexer->lookahead; - lexer->advance(lexer, true); - } - - void skp_nwl(TSLexer *lexer) { - cur_row++; - cur_col = 0; - cur_chr = lexer->lookahead; - lexer->advance(lexer, true); - } - - void mrk_end(TSLexer *lexer) { - end_row = cur_row; - end_col = cur_col; - lexer->mark_end(lexer); - } - - void init() { - cur_row = row; - cur_col = col; - cur_chr = 0; - sch_stt = 0; - rlt_sch = RS_STR; - } - - void flush() { - row = end_row; - col = end_col; - } - - void pop_ind() { - ind_len_stk.pop_back(); - ind_typ_stk.pop_back(); - } - - void push_ind(int16_t typ, int16_t len) { - ind_len_stk.push_back(len); - ind_typ_stk.push_back(typ); - } - - bool is_wsp(int32_t c) { - return c == ' ' || c == '\t'; - } - - bool is_nwl(int32_t c) { - return c == '\r' || c == '\n'; - } - - bool is_wht(int32_t c) { - return is_wsp(c) || is_nwl(c) || c == 0; - } - - bool is_ns_dec_digit(int32_t c) { - return c >= '0' && c <= '9'; - } - - bool is_ns_hex_digit(int32_t c) { - return is_ns_dec_digit(c) - || (c >= 'a' && c <= 'f') - || (c >= 'A' && c <= 'F'); - } - - bool is_ns_word_char(int32_t c) { - return c == '-' - || (c >= '0' && c <= '9') - || (c >= 'a' && c <= 'z') - || (c >= 'A' && c <= 'Z'); - } - - bool is_nb_json(int32_t c) { - return c == 0x09 || (c >= 0x20 && c <= 0x10ffff); - } - - bool is_nb_double_char(int32_t c) { - return is_nb_json(c) && c != '\\' && c != '"'; - } - - bool is_nb_single_char(int32_t c) { - return is_nb_json(c) && c != '\''; - } - - bool is_ns_char(int32_t c) { - return (c >= 0x21 && c <= 0x7e) - || c == 0x85 - || (c >= 0xa0 && c <= 0xd7ff) - || (c >= 0xe000 && c <= 0xfefe) - || (c >= 0xff00 && c <= 0xfffd) - || (c >= 0x10000 && c <= 0x10ffff); - } - - bool is_c_indicator(int32_t c) { - return c == '-' || c == '?' || c == ':' || c == ',' || c == '[' || c == ']' || c == '{' || c == '}' - || c == '#' || c == '&' || c == '*' || c == '!' || c == '|' || c == '>' || c == '\'' || c == '"' - || c == '%' || c == '@' || c == '`'; - } - - bool is_c_flow_indicator(int32_t c) { - return c == ',' || c == '[' || c == ']' || c == '{' || c == '}'; - } - - bool is_plain_safe_in_block(int32_t c) { - return is_ns_char(c); - } - - bool is_plain_safe_in_flow(int32_t c) { - return is_ns_char(c) && !is_c_flow_indicator(c); - } - - bool is_ns_uri_char(int32_t c) { - return is_ns_word_char(c) - || c == '#' || c == ';' || c == '/' || c == '?' || c == ':' || c == '@' || c == '&' - || c == '=' || c == '+' || c == '$' || c == ',' || c == '_' || c == '.' || c == '!' - || c == '~' || c == '*' || c == '\'' || c == '(' || c == ')' || c == '[' || c == ']'; - } - - bool is_ns_tag_char(int32_t c) { - return is_ns_word_char(c) - || c == '#' || c == ';' || c == '/' || c == '?' || c == ':' || c == '@' || c == '&' - || c == '=' || c == '+' || c == '$' || c == '_' || c == '.' - || c == '~' || c == '*' || c == '\'' || c == '(' || c == ')'; - } - - bool is_ns_anchor_char(int32_t c) { - return is_ns_char(c) && !is_c_flow_indicator(c); - } - - char scn_uri_esc(TSLexer *lexer) { - if (LKA != '%') return SCN_STOP; - MRK_END(); - ADV(); - if (!is_ns_hex_digit(LKA)) return SCN_FAIL; - ADV(); - if (!is_ns_hex_digit(LKA)) return SCN_FAIL; - ADV(); - return SCN_SUCC; - } - - char scn_ns_uri_char(TSLexer *lexer) { - if (is_ns_uri_char(LKA)) {ADV(); return SCN_SUCC;} - return scn_uri_esc(lexer); - } - - char scn_ns_tag_char(TSLexer *lexer) { - if (is_ns_tag_char(LKA)) {ADV(); return SCN_SUCC;} - return scn_uri_esc(lexer); - } - - bool scn_dir_bgn(TSLexer *lexer) { - ADV(); - if (LKA == 'Y') { - ADV(); - if (LKA == 'A') { - ADV(); - if (LKA == 'M') { - ADV(); - if (LKA == 'L') { - ADV(); - if (is_wht(LKA)) { - MRK_END(); - RET_SYM(S_DIR_YML_BGN); - } - } - } - } - } else if (LKA == 'T') { - ADV(); - if (LKA == 'A') { - ADV(); - if (LKA == 'G') { - ADV(); - if (is_wht(LKA)) { - MRK_END(); - RET_SYM(S_DIR_TAG_BGN); - } - } - } - } - for (;;) { - if (!is_ns_char(LKA)) break; - ADV(); - } - if (cur_col > 1 && is_wht(LKA)) { - MRK_END(); - RET_SYM(S_DIR_RSV_BGN); - } - return false; - } - - bool scn_dir_yml_ver(TSLexer *lexer, TSSymbol result_symbol) { - uint16_t n1 = 0; - uint16_t n2 = 0; - while (is_ns_dec_digit(LKA)) {ADV();n1++;} - if (LKA != '.') return false; - ADV(); - while (is_ns_dec_digit(LKA)) {ADV();n2++;} - if (n1 == 0 || n2 == 0) return false; - MRK_END(); - RET_SYM(result_symbol); - } - - bool scn_tag_hdl_tal(TSLexer *lexer) { - if (LKA == '!') {ADV();return true;} - uint16_t n = 0; - while (is_ns_word_char(LKA)) {ADV();n++;} - if (n == 0) return true; - if (LKA == '!') {ADV();return true;} - return false; - } - - bool scn_dir_tag_hdl(TSLexer *lexer, TSSymbol result_symbol) { - if (LKA == '!') { - ADV(); - if (scn_tag_hdl_tal(lexer)) {MRK_END();RET_SYM(result_symbol);} - } - return false; - } - - bool scn_dir_tag_pfx(TSLexer *lexer, TSSymbol result_symbol) { - if (LKA == '!') ADV(); - else if (scn_ns_tag_char(lexer) == SCN_SUCC); - else return false; - for (;;) { - switch (scn_ns_uri_char(lexer)) { - case SCN_STOP: - MRK_END(); - case SCN_FAIL: - RET_SYM(result_symbol); - } - } - } - - bool scn_dir_rsv_prm(TSLexer *lexer, TSSymbol result_symbol) { - if (!is_ns_char(LKA)) return false; - ADV(); - while (is_ns_char(LKA)) ADV(); - MRK_END(); - RET_SYM(result_symbol); - } - - bool scn_tag(TSLexer *lexer, TSSymbol result_symbol) { - if (LKA != '!') return false; - ADV(); - if (is_wht(LKA)) {MRK_END();RET_SYM(result_symbol);} - if (LKA == '<') { - ADV(); - if (scn_ns_uri_char(lexer) != SCN_SUCC) return false; - for (;;) { - switch (scn_ns_uri_char(lexer)) { - case SCN_STOP: - if (LKA == '>') { - ADV(); - MRK_END(); - RET_SYM(result_symbol); - } - case SCN_FAIL: - return false; - } - } - } else { - if (scn_tag_hdl_tal(lexer) && scn_ns_tag_char(lexer) != SCN_SUCC) return false; - for (;;) { - switch (scn_ns_tag_char(lexer)) { - case SCN_STOP: - MRK_END(); - case SCN_FAIL: - RET_SYM(result_symbol); - } - } - } - return false; - } - - bool scn_acr_bgn(TSLexer *lexer, TSSymbol result_symbol) { - if (LKA != '&') return false; - ADV(); - if (!is_ns_anchor_char(LKA)) return false; - MRK_END(); - RET_SYM(result_symbol); - } - - bool scn_acr_ctn(TSLexer *lexer, TSSymbol result_symbol) { - while (is_ns_anchor_char(LKA)) ADV(); - MRK_END(); - RET_SYM(result_symbol); - } - - bool scn_als_bgn(TSLexer *lexer, TSSymbol result_symbol) { - if (LKA != '*') return false; - ADV(); - if (!is_ns_anchor_char(LKA)) return false; - MRK_END(); - RET_SYM(result_symbol); - } - - bool scn_als_ctn(TSLexer *lexer, TSSymbol result_symbol) { - while (is_ns_anchor_char(LKA)) ADV(); - MRK_END(); - RET_SYM(result_symbol); - } - - bool scn_dqt_esc_seq(TSLexer *lexer, TSSymbol result_symbol) { - uint16_t i; - switch (LKA) { - case '0': case 'a': case 'b': case 't': case '\t': case 'n': case 'v': - case 'r': case 'e': case ' ': case '"': case '/': case '\\': case 'N': - case '_': case 'L': case 'P': - ADV(); - break; - case 'U': - ADV(); - for (i = 0; i < 8; i++) if (is_ns_hex_digit(LKA)) ADV(); else return false; - break; - case 'u': - ADV(); - for (i = 0; i < 4; i++) if (is_ns_hex_digit(LKA)) ADV(); else return false; - break; - case 'x': - ADV(); - for (i = 0; i < 2; i++) if (is_ns_hex_digit(LKA)) ADV(); else return false; - break; - default: - return false; - } - MRK_END(); - RET_SYM(result_symbol); - } - - bool scn_dqt_str_cnt(TSLexer *lexer, TSSymbol result_symbol) { - if (!is_nb_double_char(LKA)) return false; - if (cur_col == 0 && scn_drs_doc_end(lexer)) { - MRK_END(); - RET_SYM(cur_chr == '-' ? S_DRS_END : S_DOC_END); - } else ADV(); - while (is_nb_double_char(LKA)) ADV(); - MRK_END(); - RET_SYM(result_symbol); - } - - bool scn_sqt_str_cnt(TSLexer *lexer, TSSymbol result_symbol) { - if (!is_nb_single_char(LKA)) return false; - if (cur_col == 0 && scn_drs_doc_end(lexer)) { - MRK_END(); - RET_SYM(cur_chr == '-' ? S_DRS_END : S_DOC_END); - } else ADV(); - while (is_nb_single_char(LKA)) ADV(); - MRK_END(); - RET_SYM(result_symbol); - } - - bool scn_blk_str_bgn(TSLexer *lexer, TSSymbol result_symbol) { - if (LKA != '|' && LKA != '>') return false; - ADV(); - int16_t cur_ind = ind_len_stk.back(); - int16_t ind = -1; - if (LKA >= '1' && LKA <= '9') { - ind = LKA - '1'; - ADV(); - if (LKA == '+' || LKA == '-') { - ADV(); - } - } else if (LKA == '+' || LKA == '-') { - ADV(); - if (LKA >= '1' && LKA <= '9') { - ind = LKA - '1'; - ADV(); - } - } - if (!is_wht(LKA)) return false; - MRK_END(); - if (ind != -1) ind += cur_ind; - else { - ind = cur_ind; - while (is_wsp(LKA)) ADV(); - if (LKA == '#') { - ADV(); - while (!is_nwl(LKA) && LKA != 0) ADV(); - } - if (is_nwl(LKA)) ADV_NWL(); - while (LKA != 0) { - if (LKA == ' ') ADV(); - else if (is_nwl(LKA)) { - if (cur_col - 1 < ind) break; - ind = cur_col - 1; - ADV_NWL(); - } else { - if (cur_col - 1 > ind) ind = cur_col - 1; - break; - } - } - } - PUSH_IND(IND_STR, ind); - RET_SYM(result_symbol); - } - - bool scn_blk_str_cnt(TSLexer *lexer, TSSymbol result_symbol) { - if (!is_ns_char(LKA)) return false; - if (cur_col == 0 && scn_drs_doc_end(lexer)) {POP_IND();RET_SYM(BL);} - else ADV(); - MRK_END(); - for (;;) { - if (is_ns_char(LKA)) { - ADV(); - while (is_ns_char(LKA)) ADV(); - MRK_END(); - } - if (is_wsp(LKA)) { - ADV(); - while (is_wsp(LKA)) ADV(); - } else break; - } - RET_SYM(result_symbol); - } - - char scn_pln_cnt(TSLexer *lexer, bool (Scanner::*is_plain_safe)(int32_t)) { - bool is_cur_wsp = is_wsp(cur_chr); - bool is_cur_saf = (this->*is_plain_safe)(cur_chr); - bool is_lka_wsp = is_wsp(LKA); - bool is_lka_saf = (this->*is_plain_safe)(LKA); - if (is_lka_saf || is_lka_wsp) { - for (;;) { - if (is_lka_saf && LKA != '#' && LKA != ':') {ADV();MRK_END();UPD_SCH_STT();} - else if (is_cur_saf && LKA == '#') {ADV();MRK_END();UPD_SCH_STT();} - else if (is_lka_wsp) {ADV();UPD_SCH_STT();} - else if (LKA == ':') ADV(); // check later - else break; - - is_cur_wsp = is_lka_wsp; - is_cur_saf = is_lka_saf; - is_lka_wsp = is_wsp(LKA); - is_lka_saf = (this->*is_plain_safe)(LKA); - - if (cur_chr == ':') { - if (is_lka_saf) {MRK_END();UPD_SCH_STT();} - else return SCN_FAIL; - } - } - } else return SCN_STOP; - return SCN_SUCC; - } - - bool scn_drs_doc_end(TSLexer *lexer) { - if (LKA != '-' && LKA != '.') return false; - int32_t delimeter = LKA; - ADV(); - if (LKA == delimeter) { - ADV(); - if (LKA == delimeter) { - ADV(); - if (is_wht(LKA)) return true; - } - } - MRK_END(); - return false; - } - - bool scan(TSLexer *lexer, const bool *valid_symbols) { - init(); - MRK_END(); - - bool allow_comment = !(VLD[R_DQT_STR_CTN] || VLD[BR_DQT_STR_CTN] || VLD[R_SQT_STR_CTN] || VLD[BR_SQT_STR_CTN]); - - vector::reverse_iterator ind_ptr = ind_len_stk.rbegin(); - vector::reverse_iterator ind_end = ind_len_stk.rend(); - int16_t cur_ind = *ind_ptr++; - int16_t prt_ind = ind_ptr == ind_end ? -1 : *ind_ptr; - int16_t cur_ind_typ = ind_typ_stk.back(); - - bool has_tab_ind = false; - int16_t leading_spaces = 0; - - for (;;) { - if (LKA == ' ') { - if (!has_tab_ind) leading_spaces++; - SKP(); - } else if (LKA == '\t') { - has_tab_ind = true; - SKP(); - } else if (is_nwl(LKA)) { - has_tab_ind = false; - leading_spaces = 0; - SKP_NWL(); - } else if (allow_comment && LKA == '#') { - if (VLD[BR_BLK_STR_CTN] && VLD[BL] && cur_col <= cur_ind) {POP_IND();RET_SYM(BL);} - if ( - VLD[BR_BLK_STR_CTN] - ? cur_row == row - : cur_col == 0 || cur_row != row || cur_col > col - ) { - ADV(); - while (!is_nwl(LKA) && LKA != 0) ADV(); - MRK_END(); - RET_SYM(COMMENT); - } else break; - } else break; - } - - if (LKA == 0) { - if (VLD[BL]) {MRK_END();POP_IND();RET_SYM(BL)} - if (VLD[END_OF_FILE]) {MRK_END();RET_SYM(END_OF_FILE)} - return false; - } - - int16_t bgn_row = cur_row; - int16_t bgn_col = cur_col; - int32_t bgn_chr = LKA; - - if (VLD[BL] && bgn_col <= cur_ind && !has_tab_ind) { - if ( - cur_ind == prt_ind && cur_ind_typ == IND_SEQ - ? bgn_col < cur_ind || LKA != '-' - : bgn_col <= prt_ind || cur_ind_typ == IND_STR - ) {POP_IND();RET_SYM(BL);} - } - - bool has_nwl = cur_row > row; - bool is_r = !has_nwl; - bool is_br = has_nwl && leading_spaces > cur_ind; - bool is_b = has_nwl && leading_spaces == cur_ind && !has_tab_ind; - bool is_s = bgn_col == 0; - - if (VLD[R_DIR_YML_VER] && is_r) return scn_dir_yml_ver(lexer, R_DIR_YML_VER); - if (VLD[R_DIR_TAG_HDL] && is_r) return scn_dir_tag_hdl(lexer, R_DIR_TAG_HDL); - if (VLD[R_DIR_TAG_PFX] && is_r) return scn_dir_tag_pfx(lexer, R_DIR_TAG_PFX); - if (VLD[R_DIR_RSV_PRM] && is_r) return scn_dir_rsv_prm(lexer, R_DIR_RSV_PRM); - if (VLD[BR_BLK_STR_CTN] && is_br && scn_blk_str_cnt(lexer, BR_BLK_STR_CTN)) return true; - - if ( - (VLD[R_DQT_STR_CTN] && is_r && scn_dqt_str_cnt(lexer, R_DQT_STR_CTN)) - || (VLD[BR_DQT_STR_CTN] && is_br && scn_dqt_str_cnt(lexer, BR_DQT_STR_CTN)) - ) return true; - - if ( - (VLD[R_SQT_STR_CTN] && is_r && scn_sqt_str_cnt(lexer, R_SQT_STR_CTN)) - || (VLD[BR_SQT_STR_CTN] && is_br && scn_sqt_str_cnt(lexer, BR_SQT_STR_CTN)) - ) return true; - - if (VLD[R_ACR_CTN] && is_r) return scn_acr_ctn(lexer, R_ACR_CTN); - if (VLD[R_ALS_CTN] && is_r) return scn_als_ctn(lexer, R_ALS_CTN); - - if (LKA == '%') { - if (VLD[S_DIR_YML_BGN] && is_s) return scn_dir_bgn(lexer); - } else if (LKA == '*') { - if (VLD[R_ALS_BGN] && is_r) {MAY_UPD_IMP_COL();return scn_als_bgn(lexer, R_ALS_BGN);} - if (VLD[BR_ALS_BGN] && is_br) {MAY_UPD_IMP_COL();return scn_als_bgn(lexer, BR_ALS_BGN);} - if (VLD[B_ALS_BGN] && is_b) {MAY_UPD_IMP_COL();return scn_als_bgn(lexer, B_ALS_BGN);} - } else if (LKA == '&') { - if (VLD[R_ACR_BGN] && is_r) {MAY_UPD_IMP_COL();return scn_acr_bgn(lexer, R_ACR_BGN);} - if (VLD[BR_ACR_BGN] && is_br) {MAY_UPD_IMP_COL();return scn_acr_bgn(lexer, BR_ACR_BGN);} - if (VLD[B_ACR_BGN] && is_b) {MAY_UPD_IMP_COL();return scn_acr_bgn(lexer, B_ACR_BGN);} - } else if (LKA == '!') { - if (VLD[R_TAG] && is_r) {MAY_UPD_IMP_COL();return scn_tag(lexer, R_TAG);} - if (VLD[BR_TAG] && is_br) {MAY_UPD_IMP_COL();return scn_tag(lexer, BR_TAG);} - if (VLD[B_TAG] && is_b) {MAY_UPD_IMP_COL();return scn_tag(lexer, B_TAG);} - } else if (LKA == '[') { - if (VLD[R_FLW_SEQ_BGN] && is_r) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(R_FLW_SEQ_BGN)} - if (VLD[BR_FLW_SEQ_BGN] && is_br) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(BR_FLW_SEQ_BGN)} - if (VLD[B_FLW_SEQ_BGN] && is_b) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(B_FLW_SEQ_BGN)} - } else if (LKA == ']') { - if (VLD[R_FLW_SEQ_END] && is_r) {ADV();MRK_END();RET_SYM(R_FLW_SEQ_END)} - if (VLD[BR_FLW_SEQ_END] && is_br) {ADV();MRK_END();RET_SYM(BR_FLW_SEQ_END)} - } else if (LKA == '{') { - if (VLD[R_FLW_MAP_BGN] && is_r) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(R_FLW_MAP_BGN)} - if (VLD[BR_FLW_MAP_BGN] && is_br) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(BR_FLW_MAP_BGN)} - if (VLD[B_FLW_MAP_BGN] && is_b) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(B_FLW_MAP_BGN)} - } else if (LKA == '}') { - if (VLD[R_FLW_MAP_END] && is_r) {ADV();MRK_END();RET_SYM(R_FLW_MAP_END)} - if (VLD[BR_FLW_MAP_END] && is_br) {ADV();MRK_END();RET_SYM(BR_FLW_MAP_END)} - } else if (LKA == ',') { - if (VLD[R_FLW_SEP_BGN] && is_r) {ADV();MRK_END();RET_SYM(R_FLW_SEP_BGN)} - if (VLD[BR_FLW_SEP_BGN] && is_br) {ADV();MRK_END();RET_SYM(BR_FLW_SEP_BGN)} - } else if (LKA == '"') { - if (VLD[R_DQT_STR_BGN] && is_r) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(R_DQT_STR_BGN)} - if (VLD[BR_DQT_STR_BGN] && is_br) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(BR_DQT_STR_BGN)} - if (VLD[B_DQT_STR_BGN] && is_b) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(B_DQT_STR_BGN)} - if (VLD[R_DQT_STR_END] && is_r) {ADV();MRK_END();RET_SYM(R_DQT_STR_END)} - if (VLD[BR_DQT_STR_END] && is_br) {ADV();MRK_END();RET_SYM(BR_DQT_STR_END)} - } else if (LKA == '\'') { - if (VLD[R_SQT_STR_BGN] && is_r) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(R_SQT_STR_BGN)} - if (VLD[BR_SQT_STR_BGN] && is_br) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(BR_SQT_STR_BGN)} - if (VLD[B_SQT_STR_BGN] && is_b) {MAY_UPD_IMP_COL();ADV();MRK_END();RET_SYM(B_SQT_STR_BGN)} - if (VLD[R_SQT_STR_END] && is_r) { - ADV(); - if (LKA == '\'') {ADV();MRK_END();RET_SYM(R_SQT_ESC_SQT)} - else {MRK_END();RET_SYM(R_SQT_STR_END)} - } - if (VLD[BR_SQT_STR_END] && is_br) { - ADV(); - if (LKA == '\'') {ADV();MRK_END();RET_SYM(BR_SQT_ESC_SQT)} - else {MRK_END();RET_SYM(BR_SQT_STR_END)} - } - } else if (LKA == '?') { - bool is_r_blk_key_bgn = VLD[R_BLK_KEY_BGN] && is_r; - bool is_br_blk_key_bgn = VLD[BR_BLK_KEY_BGN] && is_br; - bool is_b_blk_key_bgn = VLD[B_BLK_KEY_BGN] && is_b; - bool is_r_flw_key_bgn = VLD[R_FLW_KEY_BGN] && is_r; - bool is_br_flw_key_bgn = VLD[BR_FLW_KEY_BGN] && is_br; - if (is_r_blk_key_bgn || is_br_blk_key_bgn || is_b_blk_key_bgn || is_r_flw_key_bgn || is_br_flw_key_bgn) { - ADV(); - if (is_wht(LKA)) { - MRK_END(); - if (is_r_blk_key_bgn) {PUSH_BGN_IND(IND_MAP);RET_SYM(R_BLK_KEY_BGN);} - if (is_br_blk_key_bgn) {PUSH_BGN_IND(IND_MAP);RET_SYM(BR_BLK_KEY_BGN);} - if (is_b_blk_key_bgn) RET_SYM(B_BLK_KEY_BGN); - if (is_r_flw_key_bgn) RET_SYM(R_FLW_KEY_BGN); - if (is_br_flw_key_bgn) RET_SYM(BR_FLW_KEY_BGN); - } - } - } else if (LKA == ':') { - if (VLD[R_FLW_JSV_BGN] && is_r) {ADV();MRK_END();RET_SYM(R_FLW_JSV_BGN);} - if (VLD[BR_FLW_JSV_BGN] && is_br) {ADV();MRK_END();RET_SYM(BR_FLW_JSV_BGN);} - bool is_r_blk_val_bgn = VLD[R_BLK_VAL_BGN] && is_r; - bool is_br_blk_val_bgn = VLD[BR_BLK_VAL_BGN] && is_br; - bool is_b_blk_val_bgn = VLD[B_BLK_VAL_BGN] && is_b; - bool is_r_blk_imp_bgn = VLD[R_BLK_IMP_BGN] && is_r; - bool is_r_flw_njv_bgn = VLD[R_FLW_NJV_BGN] && is_r; - bool is_br_flw_njv_bgn = VLD[BR_FLW_NJV_BGN] && is_br; - if (is_r_blk_val_bgn || is_br_blk_val_bgn || is_b_blk_val_bgn || is_r_blk_imp_bgn || is_r_flw_njv_bgn || is_br_flw_njv_bgn) { - ADV(); - bool is_lka_wht = is_wht(LKA); - if (is_lka_wht) { - if (is_r_blk_val_bgn) {PUSH_BGN_IND(IND_MAP);MRK_END();RET_SYM(R_BLK_VAL_BGN);} - if (is_br_blk_val_bgn) {PUSH_BGN_IND(IND_MAP);MRK_END();RET_SYM(BR_BLK_VAL_BGN);} - if (is_b_blk_val_bgn) {MRK_END();RET_SYM(B_BLK_VAL_BGN);} - if (is_r_blk_imp_bgn) {MAY_PUSH_IMP_IND();MRK_END();RET_SYM(R_BLK_IMP_BGN);} - } - if (is_lka_wht || LKA == ',' || LKA == ']' || LKA == '}') { - if (is_r_flw_njv_bgn) {MRK_END();RET_SYM(R_FLW_NJV_BGN);} - if (is_br_flw_njv_bgn) {MRK_END();RET_SYM(BR_FLW_NJV_BGN);} - } - } - } else if (LKA == '-') { - bool is_r_blk_seq_bgn = VLD[R_BLK_SEQ_BGN] && is_r; - bool is_br_blk_seq_bgn = VLD[BR_BLK_SEQ_BGN] && is_br; - bool is_b_blk_seq_bgn = VLD[B_BLK_SEQ_BGN] && is_b; - bool is_s_drs_end = is_s; - if (is_r_blk_seq_bgn || is_br_blk_seq_bgn || is_b_blk_seq_bgn || is_s_drs_end) { - ADV(); - if (is_wht(LKA)) { - if (is_r_blk_seq_bgn) {PUSH_BGN_IND(IND_SEQ);MRK_END();RET_SYM(R_BLK_SEQ_BGN)} - if (is_br_blk_seq_bgn) {PUSH_BGN_IND(IND_SEQ);MRK_END();RET_SYM(BR_BLK_SEQ_BGN)} - if (is_b_blk_seq_bgn) {MAY_PUSH_SPC_SEQ_IND();MRK_END();RET_SYM(B_BLK_SEQ_BGN)} - } else if (LKA == '-' && is_s_drs_end) { - ADV(); - if (LKA == '-') { - ADV(); - if (is_wht(LKA)) { - if (VLD[BL]) {POP_IND();RET_SYM(BL);} - MRK_END(); - RET_SYM(S_DRS_END); - } - } - } - } - } else if (LKA == '.') { - if (is_s) { - ADV(); - if (LKA == '.') { - ADV(); - if (LKA == '.') { - ADV(); - if (is_wht(LKA)) { - if (VLD[BL]) {POP_IND();RET_SYM(BL);} - MRK_END(); - RET_SYM(S_DOC_END); - } - } - } - } - } else if (LKA == '\\') { - bool is_r_dqt_esc_nwl = VLD[R_DQT_ESC_NWL] && is_r; - bool is_br_dqt_esc_nwl = VLD[BR_DQT_ESC_NWL] && is_br; - bool is_r_dqt_esc_seq = VLD[R_DQT_ESC_SEQ] && is_r; - bool is_br_dqt_esc_seq = VLD[BR_DQT_ESC_SEQ] && is_br; - if (is_r_dqt_esc_nwl || is_br_dqt_esc_nwl || is_r_dqt_esc_seq || is_br_dqt_esc_seq) { - ADV(); - if (is_nwl(LKA)) { - if (is_r_dqt_esc_nwl) {MRK_END();RET_SYM(R_DQT_ESC_NWL)} - if (is_br_dqt_esc_nwl) {MRK_END();RET_SYM(BR_DQT_ESC_NWL)} - } - if (is_r_dqt_esc_seq) return scn_dqt_esc_seq(lexer, R_DQT_ESC_SEQ); - if (is_br_dqt_esc_seq) return scn_dqt_esc_seq(lexer, BR_DQT_ESC_SEQ); - return false; - } - } else if (LKA == '|') { - if (VLD[R_BLK_LIT_BGN] && is_r) return scn_blk_str_bgn(lexer, R_BLK_LIT_BGN); - if (VLD[BR_BLK_LIT_BGN] && is_br) return scn_blk_str_bgn(lexer, BR_BLK_LIT_BGN); - } else if (LKA == '>') { - if (VLD[R_BLK_FLD_BGN] && is_r) return scn_blk_str_bgn(lexer, R_BLK_FLD_BGN); - if (VLD[BR_BLK_FLD_BGN] && is_br) return scn_blk_str_bgn(lexer, BR_BLK_FLD_BGN); - } - - bool maybe_sgl_pln_blk = (VLD[R_SGL_PLN_STR_BLK] && is_r) || (VLD[BR_SGL_PLN_STR_BLK] && is_br) || (VLD[B_SGL_PLN_STR_BLK] && is_b); - bool maybe_sgl_pln_flw = (VLD[R_SGL_PLN_STR_FLW] && is_r) || (VLD[BR_SGL_PLN_STR_FLW] && is_br); - bool maybe_mtl_pln_blk = (VLD[R_MTL_PLN_STR_BLK] && is_r) || (VLD[BR_MTL_PLN_STR_BLK] && is_br); - bool maybe_mtl_pln_flw = (VLD[R_MTL_PLN_STR_FLW] && is_r) || (VLD[BR_MTL_PLN_STR_FLW] && is_br); - - if (maybe_sgl_pln_blk || maybe_sgl_pln_flw || maybe_mtl_pln_blk || maybe_mtl_pln_flw) { - bool is_in_blk = maybe_sgl_pln_blk || maybe_mtl_pln_blk; - bool (Scanner::*is_plain_safe)(int32_t) = is_in_blk ? &Scanner::is_plain_safe_in_block : &Scanner::is_plain_safe_in_flow; - if (cur_col - bgn_col == 0) ADV(); - if (cur_col - bgn_col == 1) { - bool is_plain_first = - (is_ns_char(bgn_chr) && !is_c_indicator(bgn_chr)) - || ((bgn_chr == '-' || bgn_chr == '?' || bgn_chr == ':') - && (this->*is_plain_safe)(LKA)); - if (!is_plain_first) return false; - UPD_SCH_STT(); - } else { - // no need to check the following cases: - // ..X - // ...X - // --X - // ---X - // X: lookahead - sch_stt = SCH_STT_FRZ; // must be RS_STR - } - - MRK_END(); - - for (;;) { - if (!is_nwl(LKA)) { - if (scn_pln_cnt(lexer, is_plain_safe) != SCN_SUCC) break; - } - if (LKA == 0 || !is_nwl(LKA)) break; - for (;;) { - if (is_nwl(LKA)) ADV_NWL(); - else if (is_wsp(LKA)) ADV(); - else break; - } - if (LKA == 0 || cur_col <= cur_ind) break; - if (cur_col == 0 && scn_drs_doc_end(lexer)) break; - } - - if (end_row == bgn_row) { - if (maybe_sgl_pln_blk) {MAY_UPD_IMP_COL();RET_SYM(is_r ? SGL_PLN_SYM(R, BLK) : is_br ? SGL_PLN_SYM(BR, BLK) : SGL_PLN_SYM(B, BLK));} - if (maybe_sgl_pln_flw) RET_SYM(is_r ? SGL_PLN_SYM(R, FLW) : SGL_PLN_SYM(BR, FLW)); - } else { - if (maybe_mtl_pln_blk) {MAY_UPD_IMP_COL();RET_SYM(is_r ? R_MTL_PLN_STR_BLK : BR_MTL_PLN_STR_BLK);} - if (maybe_mtl_pln_flw) RET_SYM(is_r ? R_MTL_PLN_STR_FLW : BR_MTL_PLN_STR_FLW); - } - - return false; - } - - return false; - } -}; - -} - -extern "C" { - -void *tree_sitter_yaml_external_scanner_create() { - return new Scanner(); -} - -void tree_sitter_yaml_external_scanner_destroy(void *payload) { - Scanner *scanner = static_cast(payload); - delete scanner; -} - -unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer) { - Scanner *scanner = static_cast(payload); - return scanner->serialize(buffer); -} - -void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { - Scanner *scanner = static_cast(payload); - scanner->deserialize(buffer, length); -} - -bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { - Scanner *scanner = static_cast(payload); - return scanner->scan(lexer, valid_symbols); -} - -} diff --git a/src/schema.generated.cc b/src/schema.generated.c similarity index 97% rename from src/schema.generated.cc rename to src/schema.generated.c index f8994a2..2b77b09 100644 --- a/src/schema.generated.cc +++ b/src/schema.generated.c @@ -1,16 +1,16 @@ -#include - -namespace tree_sitter_yaml { +#include +#include +#include const int8_t SCH_STT_FRZ = -1; -enum ResultSchema { +typedef enum { RS_STR, RS_INT, RS_NUL, RS_BOL, RS_FLT, -}; +} ResultSchema; int8_t adv_sch_stt(int8_t sch_stt, int32_t cur_chr, ResultSchema *rlt_sch) { switch (sch_stt) { @@ -197,8 +197,8 @@ int8_t adv_sch_stt(int8_t sch_stt, int32_t cur_chr, ResultSchema *rlt_sch) { *rlt_sch = RS_STR; return -1; } - if (cur_chr != '\r' && cur_chr != '\n' && cur_chr != ' ' && cur_chr != 0) *rlt_sch = RS_STR; + if (cur_chr != '\r' && cur_chr != '\n' && cur_chr != ' ' && cur_chr != 0) { + *rlt_sch = RS_STR; + } return -1; } - -} \ No newline at end of file diff --git a/src/tree_sitter/alloc.h b/src/tree_sitter/alloc.h new file mode 100644 index 0000000..1f4466d --- /dev/null +++ b/src/tree_sitter/alloc.h @@ -0,0 +1,54 @@ +#ifndef TREE_SITTER_ALLOC_H_ +#define TREE_SITTER_ALLOC_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +// Allow clients to override allocation functions +#ifdef TREE_SITTER_REUSE_ALLOCATOR + +extern void *(*ts_current_malloc)(size_t); +extern void *(*ts_current_calloc)(size_t, size_t); +extern void *(*ts_current_realloc)(void *, size_t); +extern void (*ts_current_free)(void *); + +#ifndef ts_malloc +#define ts_malloc ts_current_malloc +#endif +#ifndef ts_calloc +#define ts_calloc ts_current_calloc +#endif +#ifndef ts_realloc +#define ts_realloc ts_current_realloc +#endif +#ifndef ts_free +#define ts_free ts_current_free +#endif + +#else + +#ifndef ts_malloc +#define ts_malloc malloc +#endif +#ifndef ts_calloc +#define ts_calloc calloc +#endif +#ifndef ts_realloc +#define ts_realloc realloc +#endif +#ifndef ts_free +#define ts_free free +#endif + +#endif + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_ALLOC_H_ diff --git a/src/tree_sitter/array.h b/src/tree_sitter/array.h new file mode 100644 index 0000000..15a3b23 --- /dev/null +++ b/src/tree_sitter/array.h @@ -0,0 +1,290 @@ +#ifndef TREE_SITTER_ARRAY_H_ +#define TREE_SITTER_ARRAY_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#include "./alloc.h" + +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#pragma warning(disable : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" +#endif + +#define Array(T) \ + struct { \ + T *contents; \ + uint32_t size; \ + uint32_t capacity; \ + } + +/// Initialize an array. +#define array_init(self) \ + ((self)->size = 0, (self)->capacity = 0, (self)->contents = NULL) + +/// Create an empty array. +#define array_new() \ + { NULL, 0, 0 } + +/// Get a pointer to the element at a given `index` in the array. +#define array_get(self, _index) \ + (assert((uint32_t)(_index) < (self)->size), &(self)->contents[_index]) + +/// Get a pointer to the first element in the array. +#define array_front(self) array_get(self, 0) + +/// Get a pointer to the last element in the array. +#define array_back(self) array_get(self, (self)->size - 1) + +/// Clear the array, setting its size to zero. Note that this does not free any +/// memory allocated for the array's contents. +#define array_clear(self) ((self)->size = 0) + +/// Reserve `new_capacity` elements of space in the array. If `new_capacity` is +/// less than the array's current capacity, this function has no effect. +#define array_reserve(self, new_capacity) \ + _array__reserve((Array *)(self), array_elem_size(self), new_capacity) + +/// Free any memory allocated for this array. Note that this does not free any +/// memory allocated for the array's contents. +#define array_delete(self) _array__delete((Array *)(self)) + +/// Push a new `element` onto the end of the array. +#define array_push(self, element) \ + (_array__grow((Array *)(self), 1, array_elem_size(self)), \ + (self)->contents[(self)->size++] = (element)) + +/// Increase the array's size by `count` elements. +/// New elements are zero-initialized. +#define array_grow_by(self, count) \ + do { \ + if ((count) == 0) break; \ + _array__grow((Array *)(self), count, array_elem_size(self)); \ + memset((self)->contents + (self)->size, 0, (count) * array_elem_size(self)); \ + (self)->size += (count); \ + } while (0) + +/// Append all elements from one array to the end of another. +#define array_push_all(self, other) \ + array_extend((self), (other)->size, (other)->contents) + +/// Append `count` elements to the end of the array, reading their values from the +/// `contents` pointer. +#define array_extend(self, count, contents) \ + _array__splice( \ + (Array *)(self), array_elem_size(self), (self)->size, \ + 0, count, contents \ + ) + +/// Remove `old_count` elements from the array starting at the given `index`. At +/// the same index, insert `new_count` new elements, reading their values from the +/// `new_contents` pointer. +#define array_splice(self, _index, old_count, new_count, new_contents) \ + _array__splice( \ + (Array *)(self), array_elem_size(self), _index, \ + old_count, new_count, new_contents \ + ) + +/// Insert one `element` into the array at the given `index`. +#define array_insert(self, _index, element) \ + _array__splice((Array *)(self), array_elem_size(self), _index, 0, 1, &(element)) + +/// Remove one element from the array at the given `index`. +#define array_erase(self, _index) \ + _array__erase((Array *)(self), array_elem_size(self), _index) + +/// Pop the last element off the array, returning the element by value. +#define array_pop(self) ((self)->contents[--(self)->size]) + +/// Assign the contents of one array to another, reallocating if necessary. +#define array_assign(self, other) \ + _array__assign((Array *)(self), (const Array *)(other), array_elem_size(self)) + +/// Swap one array with another +#define array_swap(self, other) \ + _array__swap((Array *)(self), (Array *)(other)) + +/// Get the size of the array contents +#define array_elem_size(self) (sizeof *(self)->contents) + +/// Search a sorted array for a given `needle` value, using the given `compare` +/// callback to determine the order. +/// +/// If an existing element is found to be equal to `needle`, then the `index` +/// out-parameter is set to the existing value's index, and the `exists` +/// out-parameter is set to true. Otherwise, `index` is set to an index where +/// `needle` should be inserted in order to preserve the sorting, and `exists` +/// is set to false. +#define array_search_sorted_with(self, compare, needle, _index, _exists) \ + _array__search_sorted(self, 0, compare, , needle, _index, _exists) + +/// Search a sorted array for a given `needle` value, using integer comparisons +/// of a given struct field (specified with a leading dot) to determine the order. +/// +/// See also `array_search_sorted_with`. +#define array_search_sorted_by(self, field, needle, _index, _exists) \ + _array__search_sorted(self, 0, _compare_int, field, needle, _index, _exists) + +/// Insert a given `value` into a sorted array, using the given `compare` +/// callback to determine the order. +#define array_insert_sorted_with(self, compare, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_with(self, compare, &(value), &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +/// Insert a given `value` into a sorted array, using integer comparisons of +/// a given struct field (specified with a leading dot) to determine the order. +/// +/// See also `array_search_sorted_by`. +#define array_insert_sorted_by(self, field, value) \ + do { \ + unsigned _index, _exists; \ + array_search_sorted_by(self, field, (value) field, &_index, &_exists); \ + if (!_exists) array_insert(self, _index, value); \ + } while (0) + +// Private + +typedef Array(void) Array; + +/// This is not what you're looking for, see `array_delete`. +static inline void _array__delete(Array *self) { + if (self->contents) { + ts_free(self->contents); + self->contents = NULL; + self->size = 0; + self->capacity = 0; + } +} + +/// This is not what you're looking for, see `array_erase`. +static inline void _array__erase(Array *self, size_t element_size, + uint32_t index) { + assert(index < self->size); + char *contents = (char *)self->contents; + memmove(contents + index * element_size, contents + (index + 1) * element_size, + (self->size - index - 1) * element_size); + self->size--; +} + +/// This is not what you're looking for, see `array_reserve`. +static inline void _array__reserve(Array *self, size_t element_size, uint32_t new_capacity) { + if (new_capacity > self->capacity) { + if (self->contents) { + self->contents = ts_realloc(self->contents, new_capacity * element_size); + } else { + self->contents = ts_malloc(new_capacity * element_size); + } + self->capacity = new_capacity; + } +} + +/// This is not what you're looking for, see `array_assign`. +static inline void _array__assign(Array *self, const Array *other, size_t element_size) { + _array__reserve(self, element_size, other->size); + self->size = other->size; + memcpy(self->contents, other->contents, self->size * element_size); +} + +/// This is not what you're looking for, see `array_swap`. +static inline void _array__swap(Array *self, Array *other) { + Array swap = *other; + *other = *self; + *self = swap; +} + +/// This is not what you're looking for, see `array_push` or `array_grow_by`. +static inline void _array__grow(Array *self, uint32_t count, size_t element_size) { + uint32_t new_size = self->size + count; + if (new_size > self->capacity) { + uint32_t new_capacity = self->capacity * 2; + if (new_capacity < 8) new_capacity = 8; + if (new_capacity < new_size) new_capacity = new_size; + _array__reserve(self, element_size, new_capacity); + } +} + +/// This is not what you're looking for, see `array_splice`. +static inline void _array__splice(Array *self, size_t element_size, + uint32_t index, uint32_t old_count, + uint32_t new_count, const void *elements) { + uint32_t new_size = self->size + new_count - old_count; + uint32_t old_end = index + old_count; + uint32_t new_end = index + new_count; + assert(old_end <= self->size); + + _array__reserve(self, element_size, new_size); + + char *contents = (char *)self->contents; + if (self->size > old_end) { + memmove( + contents + new_end * element_size, + contents + old_end * element_size, + (self->size - old_end) * element_size + ); + } + if (new_count > 0) { + if (elements) { + memcpy( + (contents + index * element_size), + elements, + new_count * element_size + ); + } else { + memset( + (contents + index * element_size), + 0, + new_count * element_size + ); + } + } + self->size += new_count - old_count; +} + +/// A binary search routine, based on Rust's `std::slice::binary_search_by`. +/// This is not what you're looking for, see `array_search_sorted_with` or `array_search_sorted_by`. +#define _array__search_sorted(self, start, compare, suffix, needle, _index, _exists) \ + do { \ + *(_index) = start; \ + *(_exists) = false; \ + uint32_t size = (self)->size - *(_index); \ + if (size == 0) break; \ + int comparison; \ + while (size > 1) { \ + uint32_t half_size = size / 2; \ + uint32_t mid_index = *(_index) + half_size; \ + comparison = compare(&((self)->contents[mid_index] suffix), (needle)); \ + if (comparison <= 0) *(_index) = mid_index; \ + size -= half_size; \ + } \ + comparison = compare(&((self)->contents[*(_index)] suffix), (needle)); \ + if (comparison == 0) *(_exists) = true; \ + else if (comparison < 0) *(_index) += 1; \ + } while (0) + +/// Helper macro for the `_sorted_by` routines below. This takes the left (existing) +/// parameter by reference in order to work with the generic sorting function above. +#define _compare_int(a, b) ((int)*(a) - (int)(b)) + +#ifdef _MSC_VER +#pragma warning(default : 4101) +#elif defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif // TREE_SITTER_ARRAY_H_ diff --git a/corpus/schema.txt b/test/corpus/schema.txt similarity index 100% rename from corpus/schema.txt rename to test/corpus/schema.txt diff --git a/corpus/spec.txt b/test/corpus/spec.txt similarity index 100% rename from corpus/spec.txt rename to test/corpus/spec.txt