fix: disallow invalid whitespaces between scalar components (#1)

This commit is contained in:
Ika 2019-08-20 09:30:20 +08:00 committed by GitHub
parent 46de5998d4
commit e2b28db714
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 2882 additions and 2706 deletions

162
corpus/custom.txt Normal file
View file

@ -0,0 +1,162 @@
================================================================================
VALID - empty file
================================================================================
--------------------------------------------------------------------------------
(root)
================================================================================
INVALID - key/value pair - multiline strings for keys are not allowed
================================================================================
"""
invalid
multiline
basic
key
""" = false
'''
invalid
multiline
literal
key
''' = false
--------------------------------------------------------------------------------
(root
(pair
(key) (ERROR) (boolean)
)
(pair
(key) (ERROR) (boolean)
)
)
================================================================================
INVALID - float - whitespace between its components is not allowed
================================================================================
invalid_fractional = 1 .0
invalid_exponent = 1 e 2
invalid_both = 1 .0 e 2
--------------------------------------------------------------------------------
(ERROR
(dotted_key
(key) (ERROR) (key)
)
)
================================================================================
INVALID - offset date time - whitespace between its components is not allowed
================================================================================
invalid1 = 1979-05-27 07:32:00 Z
invalid2 = 1979-05-27 T 07:32:00 Z
--------------------------------------------------------------------------------
(ERROR
(key) (ERROR) (local_time) (local_time)
)
================================================================================
INVALID - local date time - whitespace between its components is not allowed
================================================================================
invalid1 = 1979-05-27 07:32:00
invalid2 = 1979-05-27 T 07:32:00
--------------------------------------------------------------------------------
(root
(pair
(key) (ERROR) (local_time)
)
(pair
(key) (ERROR) (local_time)
)
)
================================================================================
INVALID - table - multiline strings for header keys are not allowed
================================================================================
["""
invalid
multiline
basic
key
"""]
['''
invalid
multiline
literal
key
''']
--------------------------------------------------------------------------------
(root
(table
(key) (ERROR)
)
(table
(key) (ERROR)
)
)
================================================================================
INVALID - inline table - newlines outside of pairs are not allowed
================================================================================
key = {
newline = true
}
--------------------------------------------------------------------------------
(root
(pair
(key)
(inline_table
(MISSING "}")
)
)
(pair
(key) (boolean)
)
(ERROR)
)
================================================================================
INVALID - array of tables - multiline strings for header keys are not allowed
================================================================================
[["""
invalid
multiline
basic
key
"""]]
[['''
invalid
multiline
literal
key
''']]
--------------------------------------------------------------------------------
(ERROR
(ERROR
(key)
)
(key)
)

View file

@ -41,7 +41,7 @@ key = # INVALID
(key) (key)
(comment) (comment)
(integer (integer
(MISSING _decimal_integer) (MISSING "integer_token1")
) )
) )
) )

View file

@ -2,9 +2,23 @@ const { Charset } = require("regexp-util");
const getInverseRegex = charset => const getInverseRegex = charset =>
new RegExp(`[^${charset.toString().slice(1, -1)}]`); new RegExp(`[^${charset.toString().slice(1, -1)}]`);
const concatRegex = (...regexes) =>
new RegExp(regexes.reduce((a, b) => a.concat(`(${b.source})`), []).join(""));
const control_chars = new Charset([0x0, 0x1f], 0x7f); const control_chars = new Charset([0x0, 0x1f], 0x7f);
const newline_regex = /(\r?\n)+/; const newline = /(\r?\n)+/;
const decimal_integer = /[+-]?(0|[1-9](_?[0-9])*)/;
const hexadecimal_integer = /0x[0-9a-fA-F](_?[0-9a-fA-F])*/;
const octal_integer = /0o[0-7](_?[0-7])*/;
const binary_integer = /0b[01](_?[01])*/;
const float_fractional_part = /[.][0-9](_?[0-9])*/;
const float_exponent_part = concatRegex(/[eE]/, decimal_integer);
const rfc3339_date = /([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])/;
const rfc3339_delimiter = /[ tT]/;
const rfc3339_time = /([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?/;
const rfc3339_offset = /([zZ])|([+-]([01][0-9]|2[0-3]):[0-5][0-9])/;
module.exports = grammar({ module.exports = grammar({
name: "toml", name: "toml",
@ -27,7 +41,7 @@ module.exports = grammar({
), ),
comment: $ => /#.*/, comment: $ => /#.*/,
_newline: $ => newline_regex, _newline: $ => newline,
_newline_or_eof: $ => choice($._newline, $._eof), _newline_or_eof: $ => choice($._newline, $._eof),
...table_like("table", "[", "]"), ...table_like("table", "[", "]"),
@ -45,8 +59,7 @@ module.exports = grammar({
key: $ => choice($._bare_key, $._quoted_key), key: $ => choice($._bare_key, $._quoted_key),
dotted_key: $ => seq(choice($.dotted_key, $.key), ".", $.key), dotted_key: $ => seq(choice($.dotted_key, $.key), ".", $.key),
_bare_key: $ => /[A-Za-z0-9_-]+/, _bare_key: $ => /[A-Za-z0-9_-]+/,
_quoted_key: $ => _quoted_key: $ => choice($._basic_string, $._literal_string),
choice($._singleline_basic_string, $._singleline_literal_string),
_inline_value: $ => _inline_value: $ =>
choice( choice(
@ -64,12 +77,12 @@ module.exports = grammar({
string: $ => string: $ =>
choice( choice(
$._singleline_basic_string, $._basic_string,
$._multiline_basic_string, $._multiline_basic_string,
$._singleline_literal_string, $._literal_string,
$._multiline_literal_string $._multiline_literal_string
), ),
_singleline_basic_string: $ => _basic_string: $ =>
seq( seq(
'"', '"',
repeat( repeat(
@ -91,7 +104,7 @@ module.exports = grammar({
repeat1(getInverseRegex(control_chars.union('"', "\\"))) repeat1(getInverseRegex(control_chars.union('"', "\\")))
), ),
token.immediate(/"{1,2}/), token.immediate(/"{1,2}/),
token.immediate(newline_regex), token.immediate(newline),
$.escape_sequence, $.escape_sequence,
alias($._escape_line_ending, $.escape_sequence) alias($._escape_line_ending, $.escape_sequence)
) )
@ -103,7 +116,7 @@ module.exports = grammar({
seq("\\", choice(/[btnfr"\\]/, /u[0-9a-fA-F]{4}/, /U[0-9a-fA-F]{8}/)) seq("\\", choice(/[btnfr"\\]/, /u[0-9a-fA-F]{4}/, /U[0-9a-fA-F]{8}/))
), ),
_escape_line_ending: $ => token.immediate(seq("\\", /\r?\n/)), _escape_line_ending: $ => token.immediate(seq("\\", /\r?\n/)),
_singleline_literal_string: $ => _literal_string: $ =>
seq( seq(
"'", "'",
optional( optional(
@ -122,7 +135,7 @@ module.exports = grammar({
repeat1(getInverseRegex(control_chars.union("'").subtract("\t"))) repeat1(getInverseRegex(control_chars.union("'").subtract("\t")))
), ),
token.immediate(/'{1,2}/), token.immediate(/'{1,2}/),
token.immediate(newline_regex) token.immediate(newline)
) )
), ),
token.immediate("'''") token.immediate("'''")
@ -130,49 +143,44 @@ module.exports = grammar({
integer: $ => integer: $ =>
choice( choice(
$._decimal_integer, decimal_integer,
$._hexadecimal_integer, hexadecimal_integer,
$._octal_integer, octal_integer,
$._binary_integer binary_integer
), ),
_decimal_integer: $ => /[+-]?(0|[1-9](_?[0-9])*)/,
_hexadecimal_integer: $ => /0x[0-9a-fA-F](_?[0-9a-fA-F])*/,
_octal_integer: $ => /0o[0-7](_?[0-7])*/,
_binary_integer: $ => /0b[01](_?[01])*/,
float: $ => float: $ =>
choice( choice(
seq( seq(
$._decimal_integer, decimal_integer,
choice( choice(
seq($._float_fractional_part, optional($._float_exponent_part)), seq(
$._float_exponent_part token.immediate(float_fractional_part),
optional(token.immediate(float_exponent_part))
),
token.immediate(float_exponent_part)
) )
), ),
/[+-]?(inf|nan)/ /[+-]?(inf|nan)/
), ),
_float_fractional_part: $ => /[.][0-9](_?[0-9])*/,
_float_exponent_part: $ => seq(/[eE]/, $._decimal_integer),
boolean: $ => /true|false/, boolean: $ => /true|false/,
offset_date_time: $ => offset_date_time: $ =>
seq( seq(
$._rfc3339_date, rfc3339_date,
$._rfc3339_delimiter, token.immediate(rfc3339_delimiter),
$._rfc3339_time, token.immediate(rfc3339_time),
$._rfc3339_offset token.immediate(rfc3339_offset)
), ),
local_date_time: $ => local_date_time: $ =>
seq($._rfc3339_date, $._rfc3339_delimiter, $._rfc3339_time), seq(
local_date: $ => $._rfc3339_date, rfc3339_date,
local_time: $ => $._rfc3339_time, token.immediate(rfc3339_delimiter),
token.immediate(rfc3339_time)
_rfc3339_date: $ => /([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])/, ),
_rfc3339_delimiter: $ => /[ tT]/, local_date: $ => rfc3339_date,
_rfc3339_time: $ => local_time: $ => rfc3339_time,
/([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?/,
_rfc3339_offset: $ => /([zZ])|([+-]([01][0-9]|2[0-3]):[0-5][0-9])/,
array: $ => array: $ =>
seq( seq(

141
src/grammar.json generated
View file

@ -423,11 +423,11 @@
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_singleline_basic_string" "name": "_basic_string"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_singleline_literal_string" "name": "_literal_string"
} }
] ]
}, },
@ -481,7 +481,7 @@
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_singleline_basic_string" "name": "_basic_string"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
@ -489,7 +489,7 @@
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
"name": "_singleline_literal_string" "name": "_literal_string"
}, },
{ {
"type": "SYMBOL", "type": "SYMBOL",
@ -497,7 +497,7 @@
} }
] ]
}, },
"_singleline_basic_string": { "_basic_string": {
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
@ -641,7 +641,7 @@
] ]
} }
}, },
"_singleline_literal_string": { "_literal_string": {
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
@ -727,38 +727,22 @@
"type": "CHOICE", "type": "CHOICE",
"members": [ "members": [
{ {
"type": "SYMBOL",
"name": "_decimal_integer"
},
{
"type": "SYMBOL",
"name": "_hexadecimal_integer"
},
{
"type": "SYMBOL",
"name": "_octal_integer"
},
{
"type": "SYMBOL",
"name": "_binary_integer"
}
]
},
"_decimal_integer": {
"type": "PATTERN", "type": "PATTERN",
"value": "[+-]?(0|[1-9](_?[0-9])*)" "value": "[+-]?(0|[1-9](_?[0-9])*)"
}, },
"_hexadecimal_integer": { {
"type": "PATTERN", "type": "PATTERN",
"value": "0x[0-9a-fA-F](_?[0-9a-fA-F])*" "value": "0x[0-9a-fA-F](_?[0-9a-fA-F])*"
}, },
"_octal_integer": { {
"type": "PATTERN", "type": "PATTERN",
"value": "0o[0-7](_?[0-7])*" "value": "0o[0-7](_?[0-7])*"
}, },
"_binary_integer": { {
"type": "PATTERN", "type": "PATTERN",
"value": "0b[01](_?[01])*" "value": "0b[01](_?[01])*"
}
]
}, },
"float": { "float": {
"type": "CHOICE", "type": "CHOICE",
@ -767,8 +751,8 @@
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "PATTERN",
"name": "_decimal_integer" "value": "[+-]?(0|[1-9](_?[0-9])*)"
}, },
{ {
"type": "CHOICE", "type": "CHOICE",
@ -777,15 +761,21 @@
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "IMMEDIATE_TOKEN",
"name": "_float_fractional_part" "content": {
"type": "PATTERN",
"value": "[.][0-9](_?[0-9])*"
}
}, },
{ {
"type": "CHOICE", "type": "CHOICE",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "IMMEDIATE_TOKEN",
"name": "_float_exponent_part" "content": {
"type": "PATTERN",
"value": "([eE])([+-]?(0|[1-9](_?[0-9])*))"
}
}, },
{ {
"type": "BLANK" "type": "BLANK"
@ -795,8 +785,11 @@
] ]
}, },
{ {
"type": "SYMBOL", "type": "IMMEDIATE_TOKEN",
"name": "_float_exponent_part" "content": {
"type": "PATTERN",
"value": "([eE])([+-]?(0|[1-9](_?[0-9])*))"
}
} }
] ]
} }
@ -808,23 +801,6 @@
} }
] ]
}, },
"_float_fractional_part": {
"type": "PATTERN",
"value": "[.][0-9](_?[0-9])*"
},
"_float_exponent_part": {
"type": "SEQ",
"members": [
{
"type": "PATTERN",
"value": "[eE]"
},
{
"type": "SYMBOL",
"name": "_decimal_integer"
}
]
},
"boolean": { "boolean": {
"type": "PATTERN", "type": "PATTERN",
"value": "true|false" "value": "true|false"
@ -833,20 +809,29 @@
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "PATTERN",
"name": "_rfc3339_date" "value": "([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])"
}, },
{ {
"type": "SYMBOL", "type": "IMMEDIATE_TOKEN",
"name": "_rfc3339_delimiter" "content": {
"type": "PATTERN",
"value": "[ tT]"
}
}, },
{ {
"type": "SYMBOL", "type": "IMMEDIATE_TOKEN",
"name": "_rfc3339_time" "content": {
"type": "PATTERN",
"value": "([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?"
}
}, },
{ {
"type": "SYMBOL", "type": "IMMEDIATE_TOKEN",
"name": "_rfc3339_offset" "content": {
"type": "PATTERN",
"value": "([zZ])|([+-]([01][0-9]|2[0-3]):[0-5][0-9])"
}
} }
] ]
}, },
@ -854,43 +839,33 @@
"type": "SEQ", "type": "SEQ",
"members": [ "members": [
{ {
"type": "SYMBOL", "type": "PATTERN",
"name": "_rfc3339_date" "value": "([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])"
}, },
{ {
"type": "SYMBOL", "type": "IMMEDIATE_TOKEN",
"name": "_rfc3339_delimiter" "content": {
"type": "PATTERN",
"value": "[ tT]"
}
}, },
{ {
"type": "SYMBOL", "type": "IMMEDIATE_TOKEN",
"name": "_rfc3339_time" "content": {
"type": "PATTERN",
"value": "([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?"
}
} }
] ]
}, },
"local_date": { "local_date": {
"type": "SYMBOL",
"name": "_rfc3339_date"
},
"local_time": {
"type": "SYMBOL",
"name": "_rfc3339_time"
},
"_rfc3339_date": {
"type": "PATTERN", "type": "PATTERN",
"value": "([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])" "value": "([0-9]+)-(0[1-9]|1[012])-(0[1-9]|[12][0-9]|3[01])"
}, },
"_rfc3339_delimiter": { "local_time": {
"type": "PATTERN",
"value": "[ tT]"
},
"_rfc3339_time": {
"type": "PATTERN", "type": "PATTERN",
"value": "([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?" "value": "([01][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9]|60)([.][0-9]+)?"
}, },
"_rfc3339_offset": {
"type": "PATTERN",
"value": "([zZ])|([+-]([01][0-9]|2[0-3]):[0-5][0-9])"
},
"array": { "array": {
"type": "SEQ", "type": "SEQ",
"members": [ "members": [

9
src/node-types.json generated
View file

@ -119,11 +119,6 @@
"named": true, "named": true,
"fields": {} "fields": {}
}, },
{
"type": "local_time",
"named": true,
"fields": {}
},
{ {
"type": "offset_date_time", "type": "offset_date_time",
"named": true, "named": true,
@ -340,6 +335,10 @@
"type": "boolean", "type": "boolean",
"named": true "named": true
}, },
{
"type": "local_time",
"named": true
},
{ {
"type": ",", "type": ",",
"named": false "named": false

5188
src/parser.c generated

File diff suppressed because it is too large Load diff