From 073be09042e28d150832b8526f29705eaf26ae8b Mon Sep 17 00:00:00 2001 From: Luc Georges Date: Sun, 27 Aug 2023 00:21:55 +0200 Subject: [PATCH] feat: add tokenizer for context window size check (#5) * feat: add tokenizer for context window size check * refactor: clippy * refactor: use `format!` instead of mut * refactor: remove unnecessary cloning --- Cargo.lock | 1059 ++++++++++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 1 + src/main.rs | 234 ++++++++++-- 3 files changed, 1233 insertions(+), 61 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f3e531..f644dfd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,26 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "aes" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac1f845298e95f983ff1944b728ae08b8cebab80d684f0a832ed0fc74dfa27e2" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "aho-corasick" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +dependencies = [ + "memchr", +] + [[package]] name = "aho-corasick" version = "1.0.3" @@ -26,6 +46,54 @@ dependencies = [ "memchr", ] +[[package]] +name = "anstream" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f58811cfac344940f1a400b6e6231ce35171f614f26439e80f8c1465c5cc0c" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15c4c2c83f81532e5845a733998b6971faca23490340a418e9b72a3ec9de12ea" + +[[package]] +name = "anstyle-parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca11d4be1bab0c8bc8734a9aa7bf4ee8316d462a08c6ac5052f888fef5b494b" +dependencies = [ + "windows-sys 0.48.0", +] + +[[package]] +name = "anstyle-wincon" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd" +dependencies = [ + "anstyle", + "windows-sys 0.48.0", +] + [[package]] name = "async-trait" version = "0.1.72" @@ -70,12 +138,24 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + [[package]] name = "bitflags" version = "1.3.2" @@ -88,24 +168,83 @@ version = "2.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "630be753d4e58660abd17930c71b647fe46c27ea6b63cc59e1e3851406972e42" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "bumpalo" version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" +[[package]] +name = "byteorder" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" + [[package]] name = "bytes" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89b2fd2a0dcf38d7971e2194b6b6eebab45ae01067456a7fd93d5547a61b70be" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "cached-path" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "097968e38f1319207f057d0f4d76452e4f4f847a5de61c5215379f297fa034f3" +dependencies = [ + "flate2", + "fs2", + "glob", + "indicatif 0.16.2", + "log", + "rand", + "reqwest", + "serde", + "serde_json", + "sha2", + "tar", + "tempfile", + "thiserror", + "zip", +] + [[package]] name = "cc" version = "1.0.82" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "305fe645edc1442a0fa8b6726ba61d422798d37a52e12eaecf4b022ebbb88f01" dependencies = [ + "jobserver", "libc", ] @@ -115,6 +254,82 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + +[[package]] +name = "clap" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d5f1946157a96594eb2d2c10eb7ad9a2b27518cb3000209dec700c35df9197d" +dependencies = [ + "clap_builder", + "clap_derive", + "once_cell", +] + +[[package]] +name = "clap_builder" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78116e32a042dd73c2901f0dc30790d20ff3447f3e3472fad359e8c3d282bcd6" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9fd1a5729c4548118d7d70ff234a44868d00489a4b6597b0b020918a0e91a1a" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.28", +] + +[[package]] +name = "clap_lex" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd7cc57abe963c6d3b9d8be5b06ba7c8957a930305ca90304f24ef040aa6f961" + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "console" +version = "0.15.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c926e00cc70edefdc64d3a5ff31cc65bb97a3460097762bd23afb4d8145fccf8" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "unicode-width", + "windows-sys 0.45.0", +] + +[[package]] +name = "constant_time_eq" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "245097e9a4535ee1e3e3931fcfcd55a796a44c643e8596ff6566d68f09b87bbc" + [[package]] name = "core-foundation" version = "0.9.3" @@ -131,6 +346,24 @@ version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" +[[package]] +name = "cpufeatures" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a17b76ff3a4162b0b27f354a0c87015ddad39d35f9c0c36607a3bdd175dde1f1" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -141,6 +374,30 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae211234986c545741a7dc064309f67ee1e5ad243d0e48335adc0484d960bcc7" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + [[package]] name = "crossbeam-utils" version = "0.8.16" @@ -150,6 +407,51 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "darling" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 1.0.109", +] + +[[package]] +name = "darling_macro" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" +dependencies = [ + "darling_core", + "quote", + "syn 1.0.109", +] + [[package]] name = "dashmap" version = "5.5.0" @@ -169,6 +471,80 @@ version = "0.3.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7684a49fb1af197853ef7b2ee694bc1f5b4179556f1e5710e1760c5db6f5e929" +[[package]] +name = "derive_builder" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "derive_builder_macro" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" +dependencies = [ + "derive_builder_core", + "syn 1.0.109", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "dirs" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "encoding_rs" version = "0.8.32" @@ -186,7 +562,7 @@ checksum = "6b30f669a7961ef1631673d2766cc92f52d64f7ef354d4fe0ddfd30ed52f0f4f" dependencies = [ "errno-dragonfly", "libc", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -199,12 +575,43 @@ dependencies = [ "libc", ] +[[package]] +name = "esaxx-rs" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f748b253ceca9fed5f42f8b5ceb3851e93102199bc25b64b65369f76e5c0a35" +dependencies = [ + "cc", +] + [[package]] name = "fastrand" version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +[[package]] +name = "filetime" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4029edd3e734da6fe05b6cd7bd2960760a616bd2ddd0d59a0124746d6272af0" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.3.5", + "windows-sys 0.48.0", +] + +[[package]] +name = "flate2" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -235,6 +642,16 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "futures" version = "0.3.28" @@ -312,12 +729,39 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "gimli" version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6c80984affa11d98d1b88b66ac8853f143217b399d3c74116778ff8fdb4ed2e" +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "h2" version = "0.3.20" @@ -349,19 +793,34 @@ version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "hermit-abi" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "443144c8cdadd93ebf52ddb4056d257f5b52c04d3c804e657d19eb73fc33668b" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + [[package]] name = "home" version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" dependencies = [ - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -435,6 +894,12 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.4.0" @@ -455,18 +920,78 @@ dependencies = [ "hashbrown 0.12.3", ] +[[package]] +name = "indicatif" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7baab56125e25686df467fe470785512329883aab42696d661247aca2a2896e4" +dependencies = [ + "console", + "lazy_static", + "number_prefix 0.3.0", + "regex", +] + +[[package]] +name = "indicatif" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d207dc617c7a380ab07ff572a6e52fa202a2a8f355860ac9c38e23f8196be1b" +dependencies = [ + "console", + "lazy_static", + "number_prefix 0.4.0", + "regex", +] + +[[package]] +name = "inout" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0c10553d664a4d0bcff9f4215d0aac67a639cc68ef660840afe309b807bc9f5" +dependencies = [ + "generic-array", +] + [[package]] name = "ipnet" version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" +[[package]] +name = "itertools" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +[[package]] +name = "jobserver" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.64" @@ -502,6 +1027,7 @@ dependencies = [ "reqwest", "ropey", "serde", + "tokenizers", "tokio", "tower-lsp", "tracing", @@ -538,6 +1064,22 @@ dependencies = [ "url", ] +[[package]] +name = "macro_rules_attribute" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf0c9b980bf4f3a37fd7b1c066941dd1b1d0152ce6ee6e8fe8c49b9f6810d862" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d" + [[package]] name = "matchers" version = "0.1.0" @@ -553,12 +1095,27 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "memoffset" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" +dependencies = [ + "autocfg", +] + [[package]] name = "mime" version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "miniz_oxide" version = "0.7.1" @@ -576,7 +1133,28 @@ checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" dependencies = [ "libc", "wasi", - "windows-sys", + "windows-sys 0.48.0", +] + +[[package]] +name = "monostate" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15f370ae88093ec6b11a710dec51321a61d420fafd1bad6e30d01bd9c920e8ee" +dependencies = [ + "monostate-impl", + "serde", +] + +[[package]] +name = "monostate-impl" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "371717c0a5543d6a800cac822eac735aa7d2d2fbb41002e9856a4089532dbdce" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", ] [[package]] @@ -597,6 +1175,16 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -617,6 +1205,18 @@ dependencies = [ "libc", ] +[[package]] +name = "number_prefix" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a" + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "object" version = "0.31.1" @@ -632,6 +1232,28 @@ version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +[[package]] +name = "onig" +version = "6.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" +dependencies = [ + "bitflags 1.3.2", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "openssl" version = "0.10.56" @@ -690,9 +1312,38 @@ checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.3.5", "smallvec", - "windows-targets", + "windows-targets 0.48.1", +] + +[[package]] +name = "password-hash" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7676374caaee8a325c9e7a2ae557f216c5563a171d6997b0ef8a65af35147700" +dependencies = [ + "base64ct", + "rand_core", + "subtle", +] + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "pbkdf2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83a0692ec44e4cf1ef28ca317f14f8f07da2d95ec3fa01f86e4467b725e60917" +dependencies = [ + "digest", + "hmac", + "password-hash", + "sha2", ] [[package]] @@ -739,6 +1390,12 @@ version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -781,6 +1438,78 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rayon" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd1259362c9065e5ea39a789ef40b1e3fd934c94beb7b5ab3ac6629d3b5e7cb7" +dependencies = [ + "either", + "itertools 0.8.2", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "redox_syscall" version = "0.3.5" @@ -790,13 +1519,24 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_users" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" +dependencies = [ + "getrandom", + "redox_syscall 0.2.16", + "thiserror", +] + [[package]] name = "regex" version = "1.9.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" dependencies = [ - "aho-corasick", + "aho-corasick 1.0.3", "memchr", "regex-automata 0.3.6", "regex-syntax 0.7.4", @@ -817,7 +1557,7 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" dependencies = [ - "aho-corasick", + "aho-corasick 1.0.3", "memchr", "regex-syntax 0.7.4", ] @@ -840,7 +1580,7 @@ version = "0.11.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" dependencies = [ - "base64", + "base64 0.21.2", "bytes", "encoding_rs", "futures-core", @@ -897,7 +1637,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -912,7 +1652,7 @@ version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c3733bf4cf7ea0880754e19cb5a462007c4a8c1914bff372ccc95b464f1df88" dependencies = [ - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -998,6 +1738,28 @@ dependencies = [ "serde", ] +[[package]] +name = "sha1" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f04293dc80c3993519f2d7f6f511707ee7094fe0c6d3406feb330cdb3540eba3" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479fb9d862239e610720565ca91403019f2f00410f1864c5aa7479b950a76ed8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.4" @@ -1032,12 +1794,36 @@ dependencies = [ "winapi", ] +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + [[package]] name = "str_indices" version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f026164926842ec52deb1938fae44f83dfdb82d0a5b0270c5bd5935ab74d6dd" +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + [[package]] name = "syn" version = "1.0.109" @@ -1060,6 +1846,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tar" +version = "0.4.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b16afcea1f22891c49a00c751c7b63b2233284064f11a200fc624137c51e2ddb" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.7.1" @@ -1068,9 +1865,29 @@ checksum = "dc02fddf48964c42031a0b3fe0428320ecf3a73c401040fc0096f97794310651" dependencies = [ "cfg-if", "fastrand", - "redox_syscall", + "redox_syscall 0.3.5", "rustix", - "windows-sys", + "windows-sys 0.48.0", +] + +[[package]] +name = "thiserror" +version = "1.0.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97a802ec30afc17eee47b2855fc72e0c4cd62be9b4efe6591edde0ec5bd68d8f" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb623b56e39ab7dcd4b1b98bb6c8f8d907ed255b18de254088016b27a8ee19b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", ] [[package]] @@ -1126,6 +1943,42 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tokenizers" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aea68938177975ab09da68552b720eac941779ff386baceaf77e0f5f9cea645f" +dependencies = [ + "aho-corasick 0.7.20", + "cached-path", + "clap", + "derive_builder", + "dirs", + "esaxx-rs", + "getrandom", + "indicatif 0.15.0", + "itertools 0.9.0", + "lazy_static", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax 0.7.4", + "reqwest", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + [[package]] name = "tokio" version = "1.29.1" @@ -1141,7 +1994,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys", + "windows-sys 0.48.0", ] [[package]] @@ -1331,6 +2184,12 @@ version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3528ecfd12c466c6f163363caf2d02a71161dd5e1cc6ae7b34207ea2d42d81ed" +[[package]] +name = "typenum" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" + [[package]] name = "unicode-bidi" version = "0.3.13" @@ -1352,6 +2211,33 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "unicode-width" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0edd1e5b14653f783770bce4a4dabb4a5108a5370a5f5d8cfe8710c361f6c8b" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + [[package]] name = "url" version = "2.4.0" @@ -1364,6 +2250,12 @@ dependencies = [ "serde", ] +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + [[package]] name = "valuable" version = "0.1.0" @@ -1495,13 +2387,37 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-sys" +version = "0.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +dependencies = [ + "windows-targets 0.42.2", +] + [[package]] name = "windows-sys" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.1", +] + +[[package]] +name = "windows-targets" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +dependencies = [ + "windows_aarch64_gnullvm 0.42.2", + "windows_aarch64_msvc 0.42.2", + "windows_i686_gnu 0.42.2", + "windows_i686_msvc 0.42.2", + "windows_x86_64_gnu 0.42.2", + "windows_x86_64_gnullvm 0.42.2", + "windows_x86_64_msvc 0.42.2", ] [[package]] @@ -1510,51 +2426,93 @@ version = "0.48.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05d4b17490f70499f20b9e791dcf6a299785ce8af4d709018206dc5b4953e95f" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.0", + "windows_aarch64_msvc 0.48.0", + "windows_i686_gnu 0.48.0", + "windows_i686_msvc 0.48.0", + "windows_x86_64_gnu 0.48.0", + "windows_x86_64_gnullvm 0.48.0", + "windows_x86_64_msvc 0.48.0", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" + [[package]] name = "windows_aarch64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" +[[package]] +name = "windows_i686_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" + [[package]] name = "windows_i686_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" +[[package]] +name = "windows_i686_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" + [[package]] name = "windows_i686_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" +[[package]] +name = "windows_x86_64_gnu" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" + [[package]] name = "windows_x86_64_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" +[[package]] +name = "windows_x86_64_msvc" +version = "0.42.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" + [[package]] name = "windows_x86_64_msvc" version = "0.48.0" @@ -1569,3 +2527,62 @@ checksum = "80d0f4e272c85def139476380b12f9ac60926689dd2e01d4923222f40580869d" dependencies = [ "winapi", ] + +[[package]] +name = "xattr" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4686009f71ff3e5c4dbcf1a282d0a44db3f021ba69350cd42086b3e5f1c6985" +dependencies = [ + "libc", +] + +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "aes", + "byteorder", + "bzip2", + "constant_time_eq", + "crc32fast", + "crossbeam-utils", + "flate2", + "hmac", + "pbkdf2", + "sha1", + "time", + "zstd", +] + +[[package]] +name = "zstd" +version = "0.11.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "5.0.2+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.8+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +dependencies = [ + "cc", + "libc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml index ceeb119..12d0962 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ home = "0.5" ropey = "1.6" serde = { version = "1", features = ["derive"] } reqwest = { version = "0.11", features = ["json"] } +tokenizers = "0.13" tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "rt-multi-thread"] } tower-lsp = "0.20" tracing = "0.1" diff --git a/src/main.rs b/src/main.rs index 5a02a69..f56ea8f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,10 @@ use ropey::Rope; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::fmt::Display; +use std::path::{Path, PathBuf}; use std::sync::Arc; +use tokenizers::Tokenizer; +use tokio::io::AsyncWriteExt; use tokio::sync::RwLock; use tower_lsp::jsonrpc::{Error, Result}; use tower_lsp::lsp_types::*; @@ -104,11 +107,13 @@ impl Document { #[derive(Debug)] struct Backend { + cache_dir: PathBuf, client: Client, document_map: Arc>>, http_client: reqwest::Client, workspace_folders: Arc>>>, language_comments: HashMap, + tokenizer_map: Arc>>, } #[derive(Debug, Deserialize, Serialize)] @@ -124,6 +129,8 @@ struct CompletionParams { fim: FimParams, api_token: Option, model: String, + tokenizer_path: Option, + context_window: usize, } fn internal_error(err: E) -> Error { @@ -138,7 +145,7 @@ fn internal_error(err: E) -> Error { fn file_path_comment( file_url: Url, - file_language_id: String, + file_language_id: &str, workspace_folders: Option<&Vec>, language_comments: &HashMap, ) -> String { @@ -155,47 +162,106 @@ fn file_path_comment( } else { file_path }; - let lc = match language_comments.get(&file_language_id) { + let lc = match language_comments.get(file_language_id) { Some(id) => id.clone(), None => LanguageComment { open: "//".to_owned(), close: None, }, }; - let mut commented_path = lc.open; - commented_path.push(' '); - commented_path.push_str(&path_in_workspace); - if let Some(close) = lc.close { - commented_path.push(' '); - commented_path.push_str(&close); - } - commented_path.push('\n'); - commented_path + let close = if let Some(close) = lc.close { + format!(" {close}") + } else { + "".to_owned() + }; + format!("{} {path_in_workspace}{close}\n", lc.open) } -fn build_prompt(pos: Position, text: &Rope, fim: &FimParams, file_path: String) -> Result { - let mut prompt = file_path; - let cursor_offset = text - .try_line_to_char(pos.line as usize) - .map_err(internal_error)? - + pos.character as usize; - let text_len = text.len_chars(); - // XXX: not sure this is useful, rather be safe than sorry - let cursor_offset = if cursor_offset > text_len { - text_len - } else { - cursor_offset - }; +fn build_prompt( + pos: Position, + text: &Rope, + fim: &FimParams, + file_path: String, + tokenizer: Tokenizer, + context_window: usize, +) -> Result { if fim.enabled { - prompt.push_str(&fim.prefix); - prompt.push_str(&text.slice(0..cursor_offset).to_string()); - prompt.push_str(&fim.suffix); - prompt.push_str(&text.slice(cursor_offset..text_len).to_string()); - prompt.push_str(&fim.middle); - Ok(prompt) + let mut token_count = context_window; + let mut before_iter = text.lines_at(pos.line as usize + 1).reversed(); + let mut after_iter = text.lines_at(pos.line as usize); + let mut before_line = before_iter.next(); + let col = pos.character as usize; + if let Some(line) = before_line { + before_line = Some(line.slice(0..col)); + } + let mut after_line = after_iter.next(); + if let Some(line) = after_line { + after_line = Some(line.slice(col..)); + } + let mut before = vec![]; + let mut after = String::new(); + while before_line.is_some() || after_line.is_some() { + if let Some(before_line) = before_line { + let before_line = before_line.to_string(); + let tokens = tokenizer + .encode(before_line.clone(), false) + .map_err(internal_error)? + .len(); + if tokens > token_count { + break; + } + token_count -= tokens; + before.push(before_line); + } + if let Some(after_line) = after_line { + let after_line = after_line.to_string(); + let tokens = tokenizer + .encode(after_line.clone(), false) + .map_err(internal_error)? + .len(); + if tokens > token_count { + break; + } + token_count -= tokens; + after.push_str(&after_line); + } + before_line = before_iter.next(); + after_line = after_iter.next(); + } + Ok(format!( + "{}{}{}{}{}{}", + file_path, + fim.prefix, + before.into_iter().rev().collect::>().join(""), + fim.suffix, + after, + fim.middle + )) } else { - prompt.push_str(&text.slice(0..cursor_offset).to_string()); - Ok(prompt) + let mut token_count = context_window; + let mut before = vec![]; + let mut first = true; + for mut line in text.lines_at(pos.line as usize).reversed() { + if first { + line = line.slice(0..pos.character as usize); + first = false; + } + let line = line.to_string(); + let tokens = tokenizer + .encode(line.clone(), false) + .map_err(internal_error)? + .len(); + if tokens > token_count { + break; + } + token_count -= tokens; + before.push(line); + } + Ok(format!( + "{}{}", + file_path, + &before.into_iter().rev().collect::>().join("") + )) } } @@ -203,15 +269,15 @@ async fn request_completion( http_client: &reqwest::Client, model: &str, request_params: RequestParams, - api_token: Option, + api_token: Option<&String>, prompt: String, ) -> Result> { - let mut req = http_client.post(model).json(&APIRequest { + let mut req = http_client.post(build_url(model)).json(&APIRequest { inputs: prompt, parameters: request_params.into(), }); - if let Some(api_token) = api_token.clone() { + if let Some(api_token) = api_token { req = req.header(AUTHORIZATION, format!("Bearer {api_token}")) } @@ -232,6 +298,82 @@ fn parse_generations(generations: Vec, stop_token: &str) -> Vec, + to: impl AsRef, +) -> Result<()> { + if to.as_ref().exists() { + return Ok(()); + } + tokio::fs::create_dir_all( + to.as_ref() + .parent() + .ok_or_else(|| internal_error("tokenizer path has no parent"))?, + ) + .await + .map_err(internal_error)?; + let mut req = http_client.get(format!( + "https://huggingface.co/{model}/resolve/main/tokenizer.json" + )); + if let Some(api_token) = api_token { + req = req.header(AUTHORIZATION, format!("Bearer {api_token}")) + } + let res = req + .send() + .await + .map_err(internal_error)? + .error_for_status() + .map_err(internal_error)?; + let mut file = tokio::fs::OpenOptions::new() + .write(true) + .create(true) + .open(to) + .await + .map_err(internal_error)?; + file.write_all(&res.bytes().await.map_err(internal_error)?) + .await + .map_err(internal_error)?; + Ok(()) +} + +async fn get_tokenizer( + model: &str, + tokenizer_map: &mut HashMap, + tokenizer_path: Option<&String>, + http_client: &reqwest::Client, + cache_dir: impl AsRef, + api_token: Option<&String>, +) -> Result { + if model.starts_with("http://") || model.starts_with("https://") { + let tokenizer = match tokenizer_path { + Some(path) => Tokenizer::from_file(path).map_err(internal_error)?, + None => return Err(internal_error("`tokenizer_path` is null")), + }; + Ok(tokenizer) + } else { + match tokenizer_map.get(model) { + Some(tokenizer) => Ok(tokenizer.clone()), + None => { + let path = cache_dir.as_ref().join(model).join("tokenizer.json"); + download_tokenizer_file(http_client, model, api_token, &path).await?; + let tokenizer = Tokenizer::from_file(path).map_err(internal_error)?; + tokenizer_map.insert(model.to_owned(), tokenizer.clone()); + Ok(tokenizer) + } + } + } +} + +fn build_url(model: &str) -> String { + if model.starts_with("http://") || model.starts_with("https://") { + model.to_owned() + } else { + format!("https://api-inference.huggingface.co/models/{model}") + } +} + impl Backend { async fn get_completions(&self, params: CompletionParams) -> Result> { info!("get_completions {params:?}"); @@ -242,23 +384,34 @@ impl Backend { .ok_or_else(|| internal_error("failed to find document"))?; let file_path = file_path_comment( params.text_document_position.text_document.uri, - document.language_id.clone(), + &document.language_id, self.workspace_folders.read().await.as_ref(), &self.language_comments, ); + let tokenizer = get_tokenizer( + ¶ms.model, + &mut *self.tokenizer_map.write().await, + params.tokenizer_path.as_ref(), + &self.http_client, + &self.cache_dir, + params.api_token.as_ref(), + ) + .await?; let prompt = build_prompt( params.text_document_position.position, &document.text, ¶ms.fim, file_path, + tokenizer, + params.context_window, )?; let stop_token = params.request_params.stop_token.clone(); let result = request_completion( &self.http_client, ¶ms.model, params.request_params, - params.api_token, - prompt.clone(), + params.api_token.as_ref(), + prompt, ) .await?; @@ -281,7 +434,6 @@ impl LanguageServer for Backend { )), ..Default::default() }, - ..Default::default() }) } @@ -358,7 +510,7 @@ async fn main() { .await .expect("failed to create cache dir"); - let log_file = rolling::never(cache_dir, "llm-ls.log"); + let log_file = rolling::never(&cache_dir, "llm-ls.log"); let builder = tracing_subscriber::fmt() .with_writer(log_file) .with_target(true) @@ -377,11 +529,13 @@ async fn main() { let http_client = reqwest::Client::new(); let (service, socket) = LspService::build(|client| Backend { + cache_dir, client, document_map: Arc::new(RwLock::new(HashMap::new())), http_client, workspace_folders: Arc::new(RwLock::new(None)), language_comments: build_language_comments(), + tokenizer_map: Arc::new(RwLock::new(HashMap::new())), }) .custom_method("llm-ls/getCompletions", Backend::get_completions) .finish();