
Commit ec243ca

server: #5655 - continue to update other slots on embedding concurrent request.
server: tests: add multi users embeddings as fixed
1 parent 525213d commit ec243ca

File tree: 3 files changed, +25 -34 lines

examples/server/server.cpp (+1, -1)

@@ -1836,7 +1836,7 @@ struct llama_server_context
                     send_embedding(slot);
                     slot.release();
                     slot.i_batch = -1;
-                    return true;
+                    continue;
                 }

                 completion_token_output result;
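
The one-line change above is the whole fix: the embedding branch runs inside the loop that walks every slot of the batch that was just decoded. Returning from the function as soon as one embedding slot finished meant the remaining slots of the same batch were never serviced in that pass, which is what stalled concurrent embedding requests in #5655; `continue` simply moves on to the next slot. Below is a standalone sketch of that control flow; the struct, fields, and helper names are simplified stand-ins for illustration, not the real llama_server_context API.

// Standalone sketch of the slot-update control flow (illustrative names only).
#include <cstdio>
#include <vector>

struct server_slot {
    int  id;
    int  i_batch;   // index of this slot's token in the decoded batch; -1 = none
    bool embedding; // true if the slot serves an embedding request

    void release() { std::printf("slot %d released\n", id); }
};

static void send_embedding(server_slot & slot) {
    std::printf("slot %d: embedding sent\n", slot.id);
}

static void sample_and_send_token(server_slot & slot) {
    std::printf("slot %d: next completion token sent\n", slot.id);
}

// One pass over all slots after a batch has been decoded.
static bool update_slots(std::vector<server_slot> & slots) {
    for (auto & slot : slots) {
        if (slot.i_batch < 0) {
            continue; // this slot had nothing in the current batch
        }

        if (slot.embedding) {
            send_embedding(slot);
            slot.release();
            slot.i_batch = -1;
            // Before the fix this was `return true;`, which ended the pass
            // here and left every later slot in the loop un-serviced.
            continue;
        }

        sample_and_send_token(slot); // completion slots keep generating
    }
    return true;
}

int main() {
    // Two concurrent embedding slots plus one completion slot: with the old
    // `return true;` only slot 0 would have been handled in this pass.
    std::vector<server_slot> slots = { {0, 0, true}, {1, 1, true}, {2, 2, false} };
    update_slots(slots);
    return 0;
}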
examples/server/tests/features/issues.feature (+1, -33)

@@ -1,36 +1,4 @@
 # List of ongoing issues
 @bug
 Feature: Issues
-  # Issue #5655
-  Scenario: Multi users embeddings
-    Given a server listening on localhost:8080
-    And   a model file stories260K.gguf
-    And   a model alias tinyllama-2
-    And   42 as server seed
-    And   64 KV cache size
-    And   2 slots
-    And   continuous batching
-    And   embeddings extraction
-    Then  the server is starting
-    Then  the server is healthy
-
-    Given a prompt:
-      """
-      Write a very long story about AI.
-      """
-    And a prompt:
-      """
-      Write another very long music lyrics.
-      """
-    And a prompt:
-      """
-      Write a very long poem.
-      """
-    And a prompt:
-      """
-      Write a very long joke.
-      """
-    Given concurrent embedding requests
-    Then the server is busy
-    Then the server is idle
-    Then all embeddings are generated
+  # No confirmed issue at the moment

examples/server/tests/features/parallel.feature (+23)

@@ -8,6 +8,7 @@ Feature: Parallel
     And   42 as server seed
     And   64 KV cache size
     And   2 slots
+    And   embeddings extraction
     And   continuous batching
     Then  the server is starting
     Then  the server is healthy
@@ -75,3 +76,25 @@ Feature: Parallel
     Then the server is busy
     Then the server is idle
     Then all prompts are predicted
+
+  Scenario: Multi users embeddings
+    Given a prompt:
+      """
+      Write a very long story about AI.
+      """
+    And a prompt:
+      """
+      Write another very long music lyrics.
+      """
+    And a prompt:
+      """
+      Write a very long poem.
+      """
+    And a prompt:
+      """
+      Write a very long joke.
+      """
+    Given concurrent embedding requests
+    Then the server is busy
+    Then the server is idle
+    Then all embeddings are generated
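
The scenario moved here from issues.feature drives the fixed path: four prompts are submitted as concurrent embedding requests against a 2-slot server and all of them must complete. For reproducing the same situation outside the behave harness, here is a rough client sketch using cpp-httplib (the header-only HTTP library bundled with the server example); the `/embedding` route and the `{"content": ...}` payload are assumptions about this server version, so check the server README of your checkout before relying on them.

// Rough reproduction of "Multi users embeddings" with concurrent HTTP clients.
// Assumes a llama.cpp server on localhost:8080 started with embeddings enabled;
// the /embedding route and JSON shape are assumptions, not taken from this commit.
#include <cstdio>
#include <string>
#include <thread>
#include <vector>

#include "httplib.h"  // cpp-httplib

int main() {
    const std::vector<std::string> prompts = {
        "Write a very long story about AI.",
        "Write another very long music lyrics.",
        "Write a very long poem.",
        "Write a very long joke.",
    };

    std::vector<std::thread> workers;
    for (const auto & prompt : prompts) {
        workers.emplace_back([prompt] {
            httplib::Client cli("localhost", 8080);
            cli.set_read_timeout(600, 0);  // small models can still take a while

            const std::string body = "{\"content\": \"" + prompt + "\"}";
            auto res = cli.Post("/embedding", body, "application/json");

            if (res && res->status == 200) {
                std::printf("ok: %zu bytes of embedding JSON\n", res->body.size());
            } else {
                std::printf("request failed\n");
            }
        });
    }

    for (auto & w : workers) {
        w.join();  // before the fix, all but one of these requests could hang
    }
    return 0;
}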
