do not regenerate embeddings
This commit is contained in:
commit
e2de886831
|
@ -502,6 +502,12 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
|
// this will be used to check if we already have embeddings for a file
|
||||||
|
modelInfo, err := os.Stat(e.model)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get model file info: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
addedFiles := make(map[string]bool) // keep track of files that have already been added
|
addedFiles := make(map[string]bool) // keep track of files that have already been added
|
||||||
for _, filePattern := range e.files {
|
for _, filePattern := range e.files {
|
||||||
matchingFiles, err := filepath.Glob(filePattern)
|
matchingFiles, err := filepath.Glob(filePattern)
|
||||||
|
@ -514,6 +520,14 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
addedFiles[filePath] = true
|
addedFiles[filePath] = true
|
||||||
|
// check if we already have embeddings for this file path
|
||||||
|
layerIdentifier := fmt.Sprintf("%s:%s:%s:%d", filePath, e.model, modelInfo.ModTime().Format("2006-01-02 15:04:05"), modelInfo.Size())
|
||||||
|
digest, _ := GetSHA256Digest(strings.NewReader(layerIdentifier))
|
||||||
|
existing, err := existingFileEmbeddings(digest)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to check existing embeddings for file %s: %v", filePath, err)
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: check file type
|
// TODO: check file type
|
||||||
f, err := os.Open(filePath)
|
f, err := os.Open(filePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -542,6 +556,11 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
|
||||||
Total: len(data) - 1,
|
Total: len(data) - 1,
|
||||||
Completed: i,
|
Completed: i,
|
||||||
})
|
})
|
||||||
|
if len(existing[d]) > 0 {
|
||||||
|
// already have an embedding for this line
|
||||||
|
embeddings = append(embeddings, vector.Embedding{Data: d, Vector: existing[d]})
|
||||||
|
continue
|
||||||
|
}
|
||||||
embed, err := llmModel.Embedding(d)
|
embed, err := llmModel.Embedding(d)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("failed to generate embedding for '%s' line %d: %v", filePath, i+1, err)
|
log.Printf("failed to generate embedding for '%s' line %d: %v", filePath, i+1, err)
|
||||||
|
@ -556,17 +575,11 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
|
||||||
}
|
}
|
||||||
r := bytes.NewReader(b)
|
r := bytes.NewReader(b)
|
||||||
|
|
||||||
digest, size := GetSHA256Digest(r)
|
|
||||||
// Reset the position of the reader after calculating the digest
|
|
||||||
if _, err := r.Seek(0, io.SeekStart); err != nil {
|
|
||||||
return nil, fmt.Errorf("could not reset embed reader: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
layer := &LayerReader{
|
layer := &LayerReader{
|
||||||
Layer: Layer{
|
Layer: Layer{
|
||||||
MediaType: "application/vnd.ollama.image.embed",
|
MediaType: "application/vnd.ollama.image.embed",
|
||||||
Digest: digest,
|
Digest: digest,
|
||||||
Size: size,
|
Size: r.Len(),
|
||||||
},
|
},
|
||||||
Reader: r,
|
Reader: r,
|
||||||
}
|
}
|
||||||
|
@ -578,6 +591,32 @@ func embeddingLayers(e EmbeddingParams) ([]*LayerReader, error) {
|
||||||
return layers, nil
|
return layers, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// existingFileEmbeddings checks if we already have embeddings for a file and loads them into a look-up map
|
||||||
|
func existingFileEmbeddings(digest string) (map[string][]float64, error) {
|
||||||
|
path, err := GetBlobsPath(digest)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("embeddings blobs path: %w", err)
|
||||||
|
}
|
||||||
|
existingFileEmbeddings := make(map[string][]float64)
|
||||||
|
if _, err := os.Stat(path); err == nil {
|
||||||
|
// already have some embeddings for this file, load embeddings previously generated
|
||||||
|
file, err := os.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to open existing embedding file: %s", err)
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
existing := []vector.Embedding{}
|
||||||
|
if err = json.NewDecoder(file).Decode(&existing); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
for _, e := range existing {
|
||||||
|
existingFileEmbeddings[e.Data] = e.Vector
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return existingFileEmbeddings, nil
|
||||||
|
}
|
||||||
|
|
||||||
func removeLayerFromLayers(layers []*LayerReader, mediaType string) []*LayerReader {
|
func removeLayerFromLayers(layers []*LayerReader, mediaType string) []*LayerReader {
|
||||||
j := 0
|
j := 0
|
||||||
for _, l := range layers {
|
for _, l := range layers {
|
||||||
|
@ -598,7 +637,8 @@ func SaveLayers(layers []*LayerReader, fn func(resp api.ProgressResponse), force
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = os.Stat(fp)
|
_, err = os.Stat(fp)
|
||||||
if os.IsNotExist(err) || force {
|
// note: embed layers are always written since their digest doesn't indicate anything about the contents
|
||||||
|
if os.IsNotExist(err) || force || layer.MediaType == "application/vnd.ollama.image.embed" {
|
||||||
fn(api.ProgressResponse{Status: fmt.Sprintf("writing layer %s", layer.Digest)})
|
fn(api.ProgressResponse{Status: fmt.Sprintf("writing layer %s", layer.Digest)})
|
||||||
|
|
||||||
out, err := os.Create(fp)
|
out, err := os.Create(fp)
|
||||||
|
@ -1181,7 +1221,7 @@ func makeRequest(ctx context.Context, method, url string, headers map[string]str
|
||||||
var ok bool
|
var ok bool
|
||||||
if retries, ok = retryCtx.(int); ok {
|
if retries, ok = retryCtx.(int); ok {
|
||||||
if retries > MaxRetries {
|
if retries > MaxRetries {
|
||||||
return nil, fmt.Errorf("Maximum retries hit; are you sure you have access to this resource?")
|
return nil, fmt.Errorf("maximum retries hit; are you sure you have access to this resource?")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue