fix: disallow invalid whitespace between scalar components (#1)

This commit is contained in:
Ika 2019-08-20 09:30:20 +08:00 committed by GitHub
parent 46de5998d4
commit e2b28db714
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 2882 additions and 2706 deletions

162
corpus/custom.txt Normal file
View file

@ -0,0 +1,162 @@
================================================================================
VALID - empty file
================================================================================
--------------------------------------------------------------------------------
(root)
================================================================================
INVALID - key/value pair - multiline strings for keys are not allowed
================================================================================
"""
invalid
multiline
basic
key
""" = false
'''
invalid
multiline
literal
key
''' = false
--------------------------------------------------------------------------------
(root
(pair
(key) (ERROR) (boolean)
)
(pair
(key) (ERROR) (boolean)
)
)
================================================================================
INVALID - float - whitespace between its components is not allowed
================================================================================
invalid_fractional = 1 .0
invalid_exponent = 1 e 2
invalid_both = 1 .0 e 2
--------------------------------------------------------------------------------
(ERROR
(dotted_key
(key) (ERROR) (key)
)
)
================================================================================
INVALID - offset date time - whitespace between its components is not allowed
================================================================================
invalid1 = 1979-05-27 07:32:00 Z
invalid2 = 1979-05-27 T 07:32:00 Z
--------------------------------------------------------------------------------
(ERROR
(key) (ERROR) (local_time) (local_time)
)
================================================================================
INVALID - local date time - whitespace between its components is not allowed
================================================================================
invalid1 = 1979-05-27 07:32:00
invalid2 = 1979-05-27 T 07:32:00
--------------------------------------------------------------------------------
(root
(pair
(key) (ERROR) (local_time)
)
(pair
(key) (ERROR) (local_time)
)
)
================================================================================
INVALID - table - multiline strings for header keys are not allowed
================================================================================
["""
invalid
multiline
basic
key
"""]
['''
invalid
multiline
basic
key
''']
--------------------------------------------------------------------------------
(root
(table
(key) (ERROR)
)
(table
(key) (ERROR)
)
)
================================================================================
INVALID - inline table - newlines outside of pairs are not allowed
================================================================================
key = {
newline = true
}
--------------------------------------------------------------------------------
(root
(pair
(key)
(inline_table
(MISSING "}")
)
)
(pair
(key) (boolean)
)
(ERROR)
)
================================================================================
INVALID - array of tables - multiline strings for header keys are not allowed
================================================================================
[["""
invalid
multiline
basic
key
"""]]
[['''
invalid
multiline
basic
key
''']]
--------------------------------------------------------------------------------
(ERROR
(ERROR
(key)
)
(key)
)

View file

@ -41,7 +41,7 @@ key = # INVALID
(key)
(comment)
(integer
(MISSING _decimal_integer)
(MISSING "integer_token1")
)
)
)

View file

@ -2,9 +2,23 @@ const { Charset } = require("regexp-util");
/**
 * Build a regex that matches one character NOT contained in `charset`.
 *
 * The charset is stringified and its first and last characters are dropped
 * (presumably the surrounding `[` and `]` of a character class — confirm
 * against regexp-util's `Charset#toString`), then the remainder is wrapped
 * in a negated character class.
 *
 * @param {{ toString(): string }} charset - object whose string form is a
 *   bracketed character class.
 * @returns {RegExp} a flagless regex of the form `[^...]`.
 */
const getInverseRegex = charset => {
  const bracketed = charset.toString();
  const members = bracketed.slice(1, -1);
  return new RegExp("[^" + members + "]");
};
/**
 * Concatenate several regexes into one.
 *
 * Each input's `source` is wrapped in its own parenthesised group and the
 * groups are joined in argument order. Flags on the inputs are not carried
 * over; the result is always created without flags.
 *
 * @param {...RegExp} regexes - regexes to join, in order.
 * @returns {RegExp} a flagless regex matching the inputs back to back.
 */
const concatRegex = (...regexes) => {
  let pattern = "";
  for (const regex of regexes) {
    pattern += `(${regex.source})`;
  }
  return new RegExp(pattern);
};
// Shared lexical building blocks, kept as plain values rather than grammar rules.

// Charset built from [0x0, 0x1f] plus 0x7f.
// NOTE(review): presumably the array is an inclusive code-point range (C0 controls) and 0x7f is DEL — confirm against regexp-util.
const control_chars = new Charset([0x0, 0x1f], 0x7f);
// One or more consecutive line endings, each either LF or CRLF.
const newline_regex = /(\r?\n)+/;
// Same pattern as `newline_regex`, under the shorter name.
const newline = /(\r?\n)+/;
// Optional sign, then either a lone 0 or a non-zero digit followed by more
// digits; each following digit may be preceded by one underscore. Leading
// zeros do not match.
const decimal_integer = /[+-]?(0|[1-9](_?[0-9])*)/;
// "0x" prefix, then hex digits (either case) with optional single underscores
// between digits. No sign is accepted.
const hexadecimal_integer = /0x[0-9a-fA-F](_?[0-9a-fA-F])*/;
// "0o" prefix, then octal digits with optional single underscores between digits.
const octal_integer = /0o[0-7](_?[0-7])*/;
// "0b" prefix, then binary digits with optional single underscores between digits.
const binary_integer = /0b[01](_?[01])*/;
// A "." followed by at least one digit, with optional single underscores
// between digits.
const float_fractional_part = /[.][0-9](_?[0-9])*/;
// "e" or "E" followed by a decimal integer, combined into one regex by
// concatRegex.
const float_exponent_part = concatRegex(/[eE]/, decimal_integer);
// Year (one or more digits), month 01-12, day 01-31, separated by "-".
// The day range is not checked against the month.
const rfc3339_date = /([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])/;
// Separator between the date and time parts: a single space, "t", or "T".
const rfc3339_delimiter = /[ tT]/;
// Hour 00-23, minute 00-59, second 00-60, then an optional "." and
// fractional digits.
const rfc3339_time = /([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?/;
// Either "z"/"Z", or a signed HH:MM offset with hour 00-23 and minute 00-59.
const rfc3339_offset = /([zZ])|([+-]([01][0-9]|2[0-3]):[0-5][0-9])/;
module.exports = grammar({
name: "toml",
@ -27,7 +41,7 @@ module.exports = grammar({
),
comment: $ => /#.*/,
_newline: $ => newline_regex,
_newline: $ => newline,
_newline_or_eof: $ => choice($._newline, $._eof),
...table_like("table", "[", "]"),
@ -45,8 +59,7 @@ module.exports = grammar({
key: $ => choice($._bare_key, $._quoted_key),
dotted_key: $ => seq(choice($.dotted_key, $.key), ".", $.key),
_bare_key: $ => /[A-Za-z0-9_-]+/,
_quoted_key: $ =>
choice($._singleline_basic_string, $._singleline_literal_string),
_quoted_key: $ => choice($._basic_string, $._literal_string),
_inline_value: $ =>
choice(
@ -64,12 +77,12 @@ module.exports = grammar({
string: $ =>
choice(
$._singleline_basic_string,
$._basic_string,
$._multiline_basic_string,
$._singleline_literal_string,
$._literal_string,
$._multiline_literal_string
),
_singleline_basic_string: $ =>
_basic_string: $ =>
seq(
'"',
repeat(
@ -91,7 +104,7 @@ module.exports = grammar({
repeat1(getInverseRegex(control_chars.union('"', "\\")))
),
token.immediate(/"{1,2}/),
token.immediate(newline_regex),
token.immediate(newline),
$.escape_sequence,
alias($._escape_line_ending, $.escape_sequence)
)
@ -103,7 +116,7 @@ module.exports = grammar({
seq("\\", choice(/[btnfr"\\]/, /u[0-9a-fA-F]{4}/, /U[0-9a-fA-F]{8}/))
),
_escape_line_ending: $ => token.immediate(seq("\\", /\r?\n/)),
_singleline_literal_string: $ =>
_literal_string: $ =>
seq(
"'",
optional(
@ -122,7 +135,7 @@ module.exports = grammar({
repeat1(getInverseRegex(control_chars.union("'").subtract("\t")))
),
token.immediate(/'{1,2}/),
token.immediate(newline_regex)
token.immediate(newline)
)
),
token.immediate("'''")
@ -130,49 +143,44 @@ module.exports = grammar({
integer: $ =>
choice(
$._decimal_integer,
$._hexadecimal_integer,
$._octal_integer,
$._binary_integer
decimal_integer,
hexadecimal_integer,
octal_integer,
binary_integer
),
_decimal_integer: $ => /[+-]?(0|[1-9](_?[0-9])*)/,
_hexadecimal_integer: $ => /0x[0-9a-fA-F](_?[0-9a-fA-F])*/,
_octal_integer: $ => /0o[0-7](_?[0-7])*/,
_binary_integer: $ => /0b[01](_?[01])*/,
float: $ =>
choice(
seq(
$._decimal_integer,
decimal_integer,
choice(
seq($._float_fractional_part, optional($._float_exponent_part)),
$._float_exponent_part
seq(
token.immediate(float_fractional_part),
optional(token.immediate(float_exponent_part))
),
token.immediate(float_exponent_part)
)
),
/[+-]?(inf|nan)/
),
_float_fractional_part: $ => /[.][0-9](_?[0-9])*/,
_float_exponent_part: $ => seq(/[eE]/, $._decimal_integer),
boolean: $ => /true|false/,
offset_date_time: $ =>
seq(
$._rfc3339_date,
$._rfc3339_delimiter,
$._rfc3339_time,
$._rfc3339_offset
rfc3339_date,
token.immediate(rfc3339_delimiter),
token.immediate(rfc3339_time),
token.immediate(rfc3339_offset)
),
local_date_time: $ =>
seq($._rfc3339_date, $._rfc3339_delimiter, $._rfc3339_time),
local_date: $ => $._rfc3339_date,
local_time: $ => $._rfc3339_time,
_rfc3339_date: $ => /([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])/,
_rfc3339_delimiter: $ => /[ tT]/,
_rfc3339_time: $ =>
/([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?/,
_rfc3339_offset: $ => /([zZ])|([+-]([01][0-9]|2[0-3]):[0-5][0-9])/,
seq(
rfc3339_date,
token.immediate(rfc3339_delimiter),
token.immediate(rfc3339_time)
),
local_date: $ => rfc3339_date,
local_time: $ => rfc3339_time,
array: $ =>
seq(

147
src/grammar.json generated
View file

@ -423,11 +423,11 @@
"members": [
{
"type": "SYMBOL",
"name": "_singleline_basic_string"
"name": "_basic_string"
},
{
"type": "SYMBOL",
"name": "_singleline_literal_string"
"name": "_literal_string"
}
]
},
@ -481,7 +481,7 @@
"members": [
{
"type": "SYMBOL",
"name": "_singleline_basic_string"
"name": "_basic_string"
},
{
"type": "SYMBOL",
@ -489,7 +489,7 @@
},
{
"type": "SYMBOL",
"name": "_singleline_literal_string"
"name": "_literal_string"
},
{
"type": "SYMBOL",
@ -497,7 +497,7 @@
}
]
},
"_singleline_basic_string": {
"_basic_string": {
"type": "SEQ",
"members": [
{
@ -641,7 +641,7 @@
]
}
},
"_singleline_literal_string": {
"_literal_string": {
"type": "SEQ",
"members": [
{
@ -727,39 +727,23 @@
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_decimal_integer"
"type": "PATTERN",
"value": "[+-]?(0|[1-9](_?[0-9])*)"
},
{
"type": "SYMBOL",
"name": "_hexadecimal_integer"
"type": "PATTERN",
"value": "0x[0-9a-fA-F](_?[0-9a-fA-F])*"
},
{
"type": "SYMBOL",
"name": "_octal_integer"
"type": "PATTERN",
"value": "0o[0-7](_?[0-7])*"
},
{
"type": "SYMBOL",
"name": "_binary_integer"
"type": "PATTERN",
"value": "0b[01](_?[01])*"
}
]
},
"_decimal_integer": {
"type": "PATTERN",
"value": "[+-]?(0|[1-9](_?[0-9])*)"
},
"_hexadecimal_integer": {
"type": "PATTERN",
"value": "0x[0-9a-fA-F](_?[0-9a-fA-F])*"
},
"_octal_integer": {
"type": "PATTERN",
"value": "0o[0-7](_?[0-7])*"
},
"_binary_integer": {
"type": "PATTERN",
"value": "0b[01](_?[01])*"
},
"float": {
"type": "CHOICE",
"members": [
@ -767,8 +751,8 @@
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_decimal_integer"
"type": "PATTERN",
"value": "[+-]?(0|[1-9](_?[0-9])*)"
},
{
"type": "CHOICE",
@ -777,15 +761,21 @@
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_float_fractional_part"
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PATTERN",
"value": "[.][0-9](_?[0-9])*"
}
},
{
"type": "CHOICE",
"members": [
{
"type": "SYMBOL",
"name": "_float_exponent_part"
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PATTERN",
"value": "([eE])([+-]?(0|[1-9](_?[0-9])*))"
}
},
{
"type": "BLANK"
@ -795,8 +785,11 @@
]
},
{
"type": "SYMBOL",
"name": "_float_exponent_part"
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PATTERN",
"value": "([eE])([+-]?(0|[1-9](_?[0-9])*))"
}
}
]
}
@ -808,23 +801,6 @@
}
]
},
"_float_fractional_part": {
"type": "PATTERN",
"value": "[.][0-9](_?[0-9])*"
},
"_float_exponent_part": {
"type": "SEQ",
"members": [
{
"type": "PATTERN",
"value": "[eE]"
},
{
"type": "SYMBOL",
"name": "_decimal_integer"
}
]
},
"boolean": {
"type": "PATTERN",
"value": "true|false"
@ -833,20 +809,29 @@
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_rfc3339_date"
"type": "PATTERN",
"value": "([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])"
},
{
"type": "SYMBOL",
"name": "_rfc3339_delimiter"
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PATTERN",
"value": "[ tT]"
}
},
{
"type": "SYMBOL",
"name": "_rfc3339_time"
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PATTERN",
"value": "([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?"
}
},
{
"type": "SYMBOL",
"name": "_rfc3339_offset"
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PATTERN",
"value": "([zZ])|([+-]([01][0-9]|2[0-3]):[0-5][0-9])"
}
}
]
},
@ -854,43 +839,33 @@
"type": "SEQ",
"members": [
{
"type": "SYMBOL",
"name": "_rfc3339_date"
"type": "PATTERN",
"value": "([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])"
},
{
"type": "SYMBOL",
"name": "_rfc3339_delimiter"
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PATTERN",
"value": "[ tT]"
}
},
{
"type": "SYMBOL",
"name": "_rfc3339_time"
"type": "IMMEDIATE_TOKEN",
"content": {
"type": "PATTERN",
"value": "([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?"
}
}
]
},
"local_date": {
"type": "SYMBOL",
"name": "_rfc3339_date"
},
"local_time": {
"type": "SYMBOL",
"name": "_rfc3339_time"
},
"_rfc3339_date": {
"type": "PATTERN",
"value": "([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])"
},
"_rfc3339_delimiter": {
"type": "PATTERN",
"value": "[ tT]"
},
"_rfc3339_time": {
"local_time": {
"type": "PATTERN",
"value": "([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?"
},
"_rfc3339_offset": {
"type": "PATTERN",
"value": "([zZ])|([+-]([01][0-9]|2[0-3]):[0-5][0-9])"
},
"array": {
"type": "SEQ",
"members": [

9
src/node-types.json generated
View file

@ -119,11 +119,6 @@
"named": true,
"fields": {}
},
{
"type": "local_time",
"named": true,
"fields": {}
},
{
"type": "offset_date_time",
"named": true,
@ -340,6 +335,10 @@
"type": "boolean",
"named": true
},
{
"type": "local_time",
"named": true
},
{
"type": ",",
"named": false

5188
src/parser.c generated

File diff suppressed because it is too large Load diff