diff --git a/output/openapi/elasticsearch-openapi.json b/output/openapi/elasticsearch-openapi.json index 0117e20b40..8b4c44e41b 100644 --- a/output/openapi/elasticsearch-openapi.json +++ b/output/openapi/elasticsearch-openapi.json @@ -17916,6 +17916,118 @@ "x-state": "Added in 8.12.0" } }, + "/_inference/{task_type}/{elasticsearch_inference_id}": { + "put": { + "tags": [ + "inference" + ], + "summary": "Create an Elasticsearch inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `elasticsearch` service.\n\n> info\n> Your Elasticsearch deployment contains preconfigured ELSER and E5 inference endpoints, you only need to create the endpoints using the API if you want to customize the settings.\n\nIf you use the ELSER or the E5 model through the `elasticsearch` service, the API request will automatically download and deploy the model if it isn't downloaded yet.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "operationId": "inference-put-elasticsearch", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference.put_elasticsearch:ElasticsearchTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "elasticsearch_inference_id", + "description": "The unique identifier of the inference endpoint.\nIt must not match the `model_id`.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types:Id" + }, + "style": "simple" + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types:InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference.put_elasticsearch:ServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference.put_elasticsearch:ElasticsearchServiceSettings" + }, + "task_settings": { + "$ref": "#/components/schemas/inference.put_elasticsearch:ElasticsearchTaskSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutElasticsearchRequestExample1": { + "summary": "ELSER sparse embedding task", + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a 
`sparse_embedding` task. The `model_id` must be the ID of one of the built-in ELSER models. The API will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"adaptive_allocations\": { \n \"enabled\": true,\n \"min_number_of_allocations\": 1,\n \"max_number_of_allocations\": 4\n },\n \"num_threads\": 1,\n \"model_id\": \".elser_model_2\" \n }\n}" + }, + "PutElasticsearchRequestExample2": { + "summary": "Elastic rerank task", + "description": "Run `PUT _inference/rerank/my-elastic-rerank` to create an inference endpoint that performs a rerank task using the built-in Elastic Rerank cross-encoder model. The `model_id` must be `.rerank-v1`, which is the ID of the built-in Elastic Rerank model. The API will automatically download the Elastic Rerank model if it isn't already downloaded and then deploy the model. Once deployed, the model can be used for semantic re-ranking with a `text_similarity_reranker` retriever.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"model_id\": \".rerank-v1\", \n \"num_threads\": 1,\n \"adaptive_allocations\": { \n \"enabled\": true,\n \"min_number_of_allocations\": 1,\n \"max_number_of_allocations\": 4\n }\n }\n}" + }, + "PutElasticsearchRequestExample3": { + "summary": "E5 text embedding task", + "description": "Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task. The `model_id` must be the ID of one of the built-in E5 models. 
The API will automatically download the E5 model if it isn't already downloaded and then deploy the model.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1,\n \"model_id\": \".multilingual-e5-small\" \n }\n}" + }, + "PutElasticsearchRequestExample4": { + "summary": "Eland text embedding task", + "description": "Run `PUT _inference/text_embedding/my-msmarco-minilm-model` to create an inference endpoint that performs a `text_embedding` task with a model that was uploaded by Eland.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1,\n \"model_id\": \"msmarco-MiniLM-L12-cos-v5\" \n }\n}" + }, + "PutElasticsearchRequestExample5": { + "summary": "Adaptive allocation", + "description": "Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task and to configure adaptive allocations. The API request will automatically download the E5 model if it isn't already downloaded and then deploy the model.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1,\n \"model_id\": \".multilingual-e5-small\"\n }\n}" + }, + "PutElasticsearchRequestExample6": { + "summary": "Existing model deployment", + "description": "Run `PUT _inference/sparse_embedding/use_existing_deployment` to use an already existing model deployment when creating an inference endpoint.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"deployment_id\": \".elser_model_2\"\n }\n}" + } + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types:InferenceEndpointInfo" + }, + "examples": { + 
"PutElasticsearchResponseExample1": { + "description": "A successful response from `PUT _inference/sparse_embedding/use_existing_deployment`. It contains the model ID and the threads and allocations settings from the model deployment.\n", + "value": "{\n \"inference_id\": \"use_existing_deployment\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 2,\n \"num_threads\": 1,\n \"model_id\": \".elser_model_2\",\n \"deployment_id\": \".elser_model_2\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + } + } + } + } + } + }, + "x-state": "Added in 8.13.0" + } + }, "/_inference/{task_type}/{elser_inference_id}": { "put": { "tags": [ @@ -77636,6 +77748,77 @@ "model_id" ] }, + "inference.put_elasticsearch:ElasticsearchTaskType": { + "type": "string", + "enum": [ + "rerank", + "sparse_embedding", + "text_embedding" + ] + }, + "inference.put_elasticsearch:ServiceType": { + "type": "string", + "enum": [ + "elasticsearch" + ] + }, + "inference.put_elasticsearch:ElasticsearchServiceSettings": { + "type": "object", + "properties": { + "adaptive_allocations": { + "$ref": "#/components/schemas/inference.put_elasticsearch:AdaptiveAllocations" + }, + "deployment_id": { + "description": "The deployment identifier for a trained model deployment.\nWhen `deployment_id` is used the `model_id` is optional.", + "type": "string" + }, + "model_id": { + "externalDocs": { + "url": "https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-import-model.html#ml-nlp-import-script" + }, + "description": "The name of the model to use for the inference task.\nIt can be the ID of a built-in model (for example, `.multilingual-e5-small` for E5) or a text embedding model that was uploaded by using the Eland client.", + "type": "string" + }, + "num_allocations": { + "description": "The total number of allocations that are assigned to the model across 
machine learning nodes.\nIncreasing this value generally increases the throughput.\nIf adaptive allocations are enabled, do not set this value because it's automatically set.", + "type": "number" + }, + "num_threads": { + "description": "The number of threads used by each model allocation during inference.\nThis setting generally increases the speed per inference request.\nThe inference process is a compute-bound process; `threads_per_allocation` must not exceed the number of available allocated processors per node.\nThe value must be a power of 2.\nThe maximum value is 32.", + "type": "number" + } + }, + "required": [ + "model_id", + "num_threads" + ] + }, + "inference.put_elasticsearch:AdaptiveAllocations": { + "type": "object", + "properties": { + "enabled": { + "description": "Turn on `adaptive_allocations`.", + "type": "boolean" + }, + "max_number_of_allocations": { + "description": "The maximum number of allocations to scale to.\nIf set, it must be greater than or equal to `min_number_of_allocations`.", + "type": "number" + }, + "min_number_of_allocations": { + "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", + "type": "number" + } + } + }, + "inference.put_elasticsearch:ElasticsearchTaskSettings": { + "type": "object", + "properties": { + "return_documents": { + "description": "For a `rerank` task, return the document instead of only the index.", + "type": "boolean" + } + } + }, "inference.put_elser:ElserTaskType": { "type": "string", "enum": [ diff --git a/output/openapi/elasticsearch-serverless-openapi.json b/output/openapi/elasticsearch-serverless-openapi.json index 355905c31f..ea63b99c01 100644 --- a/output/openapi/elasticsearch-serverless-openapi.json +++ b/output/openapi/elasticsearch-serverless-openapi.json @@ -9738,6 +9738,118 @@ "x-state": "Added in 8.12.0" } }, + "/_inference/{task_type}/{elasticsearch_inference_id}": { + "put": { + "tags": 
[ + "inference" + ], + "summary": "Create an Elasticsearch inference endpoint", + "description": "Create an inference endpoint to perform an inference task with the `elasticsearch` service.\n\n> info\n> Your Elasticsearch deployment contains preconfigured ELSER and E5 inference endpoints, you only need to create the endpoints using the API if you want to customize the settings.\n\nIf you use the ELSER or the E5 model through the `elasticsearch` service, the API request will automatically download and deploy the model if it isn't downloaded yet.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "operationId": "inference-put-elasticsearch", + "parameters": [ + { + "in": "path", + "name": "task_type", + "description": "The type of the inference task that the model will perform.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/inference.put_elasticsearch:ElasticsearchTaskType" + }, + "style": "simple" + }, + { + "in": "path", + "name": "elasticsearch_inference_id", + "description": "The unique identifier of the inference endpoint.\nIt must not match the `model_id`.", + "required": true, + "deprecated": false, + "schema": { + "$ref": "#/components/schemas/_types:Id" + }, + "style": "simple" + } + ], + "requestBody": { + "content": 
{ + "application/json": { + "schema": { + "type": "object", + "properties": { + "chunking_settings": { + "$ref": "#/components/schemas/inference._types:InferenceChunkingSettings" + }, + "service": { + "$ref": "#/components/schemas/inference.put_elasticsearch:ServiceType" + }, + "service_settings": { + "$ref": "#/components/schemas/inference.put_elasticsearch:ElasticsearchServiceSettings" + }, + "task_settings": { + "$ref": "#/components/schemas/inference.put_elasticsearch:ElasticsearchTaskSettings" + } + }, + "required": [ + "service", + "service_settings" + ] + }, + "examples": { + "PutElasticsearchRequestExample1": { + "summary": "ELSER sparse embedding task", + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The `model_id` must be the ID of one of the built-in ELSER models. The API will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"adaptive_allocations\": { \n \"enabled\": true,\n \"min_number_of_allocations\": 1,\n \"max_number_of_allocations\": 4\n },\n \"num_threads\": 1,\n \"model_id\": \".elser_model_2\" \n }\n}" + }, + "PutElasticsearchRequestExample2": { + "summary": "Elastic rerank task", + "description": "Run `PUT _inference/rerank/my-elastic-rerank` to create an inference endpoint that performs a rerank task using the built-in Elastic Rerank cross-encoder model. The `model_id` must be `.rerank-v1`, which is the ID of the built-in Elastic Rerank model. The API will automatically download the Elastic Rerank model if it isn't already downloaded and then deploy the model. 
Once deployed, the model can be used for semantic re-ranking with a `text_similarity_reranker` retriever.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"model_id\": \".rerank-v1\", \n \"num_threads\": 1,\n \"adaptive_allocations\": { \n \"enabled\": true,\n \"min_number_of_allocations\": 1,\n \"max_number_of_allocations\": 4\n }\n }\n}" + }, + "PutElasticsearchRequestExample3": { + "summary": "E5 text embedding task", + "description": "Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task. The `model_id` must be the ID of one of the built-in E5 models. The API will automatically download the E5 model if it isn't already downloaded and then deploy the model.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1,\n \"model_id\": \".multilingual-e5-small\" \n }\n}" + }, + "PutElasticsearchRequestExample4": { + "summary": "Eland text embedding task", + "description": "Run `PUT _inference/text_embedding/my-msmarco-minilm-model` to create an inference endpoint that performs a `text_embedding` task with a model that was uploaded by Eland.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1,\n \"model_id\": \"msmarco-MiniLM-L12-cos-v5\" \n }\n}" + }, + "PutElasticsearchRequestExample5": { + "summary": "Adaptive allocation", + "description": "Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task and to configure adaptive allocations. 
The API request will automatically download the E5 model if it isn't already downloaded and then deploy the model.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1,\n \"model_id\": \".multilingual-e5-small\"\n }\n}" + }, + "PutElasticsearchRequestExample6": { + "summary": "Existing model deployment", + "description": "Run `PUT _inference/sparse_embedding/use_existing_deployment` to use an already existing model deployment when creating an inference endpoint.", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"deployment_id\": \".elser_model_2\"\n }\n}" + } + } + } + } + }, + "responses": { + "200": { + "description": "", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/inference._types:InferenceEndpointInfo" + }, + "examples": { + "PutElasticsearchResponseExample1": { + "description": "A successful response from `PUT _inference/sparse_embedding/use_existing_deployment`. 
It contains the model ID and the threads and allocations settings from the model deployment.\n", + "value": "{\n \"inference_id\": \"use_existing_deployment\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 2,\n \"num_threads\": 1,\n \"model_id\": \".elser_model_2\",\n \"deployment_id\": \".elser_model_2\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + } + } + } + } + } + }, + "x-state": "Added in 8.13.0" + } + }, "/_inference/{task_type}/{elser_inference_id}": { "put": { "tags": [ @@ -48828,6 +48940,77 @@ "model_id" ] }, + "inference.put_elasticsearch:ElasticsearchTaskType": { + "type": "string", + "enum": [ + "rerank", + "sparse_embedding", + "text_embedding" + ] + }, + "inference.put_elasticsearch:ServiceType": { + "type": "string", + "enum": [ + "elasticsearch" + ] + }, + "inference.put_elasticsearch:ElasticsearchServiceSettings": { + "type": "object", + "properties": { + "adaptive_allocations": { + "$ref": "#/components/schemas/inference.put_elasticsearch:AdaptiveAllocations" + }, + "deployment_id": { + "description": "The deployment identifier for a trained model deployment.\nWhen `deployment_id` is used the `model_id` is optional.", + "type": "string" + }, + "model_id": { + "externalDocs": { + "url": "https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-import-model.html#ml-nlp-import-script" + }, + "description": "The name of the model to use for the inference task.\nIt can be the ID of a built-in model (for example, `.multilingual-e5-small` for E5) or a text embedding model that was uploaded by using the Eland client.", + "type": "string" + }, + "num_allocations": { + "description": "The total number of allocations that are assigned to the model across machine learning nodes.\nIncreasing this value generally increases the throughput.\nIf adaptive allocations are enabled, do not set this value 
because it's automatically set.", + "type": "number" + }, + "num_threads": { + "description": "The number of threads used by each model allocation during inference.\nThis setting generally increases the speed per inference request.\nThe inference process is a compute-bound process; `threads_per_allocation` must not exceed the number of available allocated processors per node.\nThe value must be a power of 2.\nThe maximum value is 32.", + "type": "number" + } + }, + "required": [ + "model_id", + "num_threads" + ] + }, + "inference.put_elasticsearch:AdaptiveAllocations": { + "type": "object", + "properties": { + "enabled": { + "description": "Turn on `adaptive_allocations`.", + "type": "boolean" + }, + "max_number_of_allocations": { + "description": "The maximum number of allocations to scale to.\nIf set, it must be greater than or equal to `min_number_of_allocations`.", + "type": "number" + }, + "min_number_of_allocations": { + "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", + "type": "number" + } + } + }, + "inference.put_elasticsearch:ElasticsearchTaskSettings": { + "type": "object", + "properties": { + "return_documents": { + "description": "For a `rerank` task, return the document instead of only the index.", + "type": "boolean" + } + } + }, "inference.put_elser:ElserTaskType": { "type": "string", "enum": [ diff --git a/output/schema/schema-serverless.json b/output/schema/schema-serverless.json index a046a91fb8..cf050f9049 100644 --- a/output/schema/schema-serverless.json +++ b/output/schema/schema-serverless.json @@ -4681,6 +4681,51 @@ } ] }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "8.13.0", + "stability": "stable", + "visibility": "public" + } + }, + "description": "Create an Elasticsearch inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the 
`elasticsearch` service.\n\n> info\n> Your Elasticsearch deployment contains preconfigured ELSER and E5 inference endpoints, you only need to create the endpoints using the API if you want to customize the settings.\n\nIf you use the ELSER or the E5 model through the `elasticsearch` service, the API request will automatically download and deploy the model if it isn't downloaded yet.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "docId": "inference-api-put-elasticsearch", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elasticsearch.html", + "name": "inference.put_elasticsearch", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_elasticsearch" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_elasticsearch" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{elasticsearch_inference_id}" + } + ] + }, { "availability": { "serverless": { "stability": "stable", "visibility": "public" }, @@ -27515,6 +27560,162 @@ }, "specLocation": "inference/put_eis/PutEisResponse.ts#L22-L24" }, + { + "attachedBehaviors": 
[ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `elasticsearch`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ServiceType", + "namespace": "inference.put_elasticsearch" + } + } + }, + { + "description": "Settings used to install the inference model. These settings are specific to the `elasticsearch` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElasticsearchServiceSettings", + "namespace": "inference.put_elasticsearch" + } + } + }, + { + "description": "Settings to configure the inference task.\nThese settings are specific to the task type you specified.", + "name": "task_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "ElasticsearchTaskSettings", + "namespace": "inference.put_elasticsearch" + } + } + } + ] + }, + "description": "Create an Elasticsearch inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `elasticsearch` service.\n\n> info\n> Your Elasticsearch deployment contains preconfigured ELSER and E5 inference endpoints, you only need to create the endpoints using the API if you want to customize the settings.\n\nIf you use the ELSER or the E5 model through the `elasticsearch` service, the API request will automatically download and deploy the model if it isn't downloaded yet.\n\n> info\n> You might see a 502 bad gateway error 
in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "examples": { + "PutElasticsearchRequestExample1": { + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The `model_id` must be the ID of one of the built-in ELSER models. The API will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "summary": "ELSER sparse embedding task", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"adaptive_allocations\": { \n \"enabled\": true,\n \"min_number_of_allocations\": 1,\n \"max_number_of_allocations\": 4\n },\n \"num_threads\": 1,\n \"model_id\": \".elser_model_2\" \n }\n}" + }, + "PutElasticsearchRequestExample2": { + "description": "Run `PUT _inference/rerank/my-elastic-rerank` to create an inference endpoint that performs a rerank task using the built-in Elastic Rerank cross-encoder model. The `model_id` must be `.rerank-v1`, which is the ID of the built-in Elastic Rerank model. The API will automatically download the Elastic Rerank model if it isn't already downloaded and then deploy the model. 
Once deployed, the model can be used for semantic re-ranking with a `text_similarity_reranker` retriever.", + "summary": "Elastic rerank task", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"model_id\": \".rerank-v1\", \n \"num_threads\": 1,\n \"adaptive_allocations\": { \n \"enabled\": true,\n \"min_number_of_allocations\": 1,\n \"max_number_of_allocations\": 4\n }\n }\n}" + }, + "PutElasticsearchRequestExample3": { + "description": "Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task. The `model_id` must be the ID of one of the built-in E5 models. The API will automatically download the E5 model if it isn't already downloaded and then deploy the model.", + "summary": "E5 text embedding task", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1,\n \"model_id\": \".multilingual-e5-small\" \n }\n}" + }, + "PutElasticsearchRequestExample4": { + "description": "Run `PUT _inference/text_embedding/my-msmarco-minilm-model` to create an inference endpoint that performs a `text_embedding` task with a model that was uploaded by Eland.", + "summary": "Eland text embedding task", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1,\n \"model_id\": \"msmarco-MiniLM-L12-cos-v5\" \n }\n}" + }, + "PutElasticsearchRequestExample5": { + "description": "Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task and to configure adaptive allocations. 
The API request will automatically download the E5 model if it isn't already downloaded and then deploy the model.", + "summary": "Adaptive allocation", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1,\n \"model_id\": \".multilingual-e5-small\"\n }\n}" + }, + "PutElasticsearchRequestExample6": { + "description": "Run `PUT _inference/sparse_embedding/use_existing_deployment` to use an already existing model deployment when creating an inference endpoint.", + "summary": "Existing model deployment", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"deployment_id\": \".elser_model_2\"\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "kind": "request", + "name": { + "name": "Request", + "namespace": "inference.put_elasticsearch" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElasticsearchTaskType", + "namespace": "inference.put_elasticsearch" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.\nIt must not match the `model_id`.", + "name": "elasticsearch_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [], + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L25-L86" + }, + { + "body": { + "kind": "value", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfo", + "namespace": "inference._types" + } + } + }, + "examples": { + "PutElasticsearchResponseExample1": { + "description": "A successful response from `PUT _inference/sparse_embedding/use_existing_deployment`. 
It contains the model ID and the threads and allocations settings from the model deployment.\n", + "value": "{\n \"inference_id\": \"use_existing_deployment\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 2,\n \"num_threads\": 1,\n \"model_id\": \".elser_model_2\",\n \"deployment_id\": \".elser_model_2\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + } + }, + "kind": "response", + "name": { + "name": "Response", + "namespace": "inference.put_elasticsearch" + }, + "specLocation": "inference/put_elasticsearch/PutElasticsearchResponse.ts#L22-L24" + }, { "attachedBehaviors": [ "CommonQueryParameters" @@ -101754,6 +101955,38 @@ }, "specLocation": "inference/put_eis/PutEisRequest.ts#L68-L70" }, + { + "kind": "enum", + "members": [ + { + "name": "rerank" + }, + { + "name": "sparse_embedding" + }, + { + "name": "text_embedding" + } + ], + "name": { + "name": "ElasticsearchTaskType", + "namespace": "inference.put_elasticsearch" + }, + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L88-L92" + }, + { + "kind": "enum", + "members": [ + { + "name": "elasticsearch" + } + ], + "name": { + "name": "ServiceType", + "namespace": "inference.put_elasticsearch" + }, + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L94-L96" + }, { "kind": "enum", "members": [ @@ -122591,6 +122824,148 @@ ], "specLocation": "inference/put_eis/PutEisRequest.ts#L72-L82" }, + { + "kind": "interface", + "name": { + "name": "ElasticsearchServiceSettings", + "namespace": "inference.put_elasticsearch" + }, + "properties": [ + { + "description": "Adaptive allocations configuration details.\nIf `enabled` is true, the number of allocations of the model is set based on the current load the process gets.\nWhen the load is high, a new model allocation is automatically created, respecting the value of 
`max_number_of_allocations` if it's set.\nWhen the load is low, a model allocation is automatically removed, respecting the value of `min_number_of_allocations` if it's set.\nIf `enabled` is true, do not set the number of allocations manually.", + "name": "adaptive_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "AdaptiveAllocations", + "namespace": "inference.put_elasticsearch" + } + } + }, + { + "description": "The deployment identifier for a trained model deployment.\nWhen `deployment_id` is used the `model_id` is optional.", + "name": "deployment_id", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The name of the model to use for the inference task.\nIt can be the ID of a built-in model (for example, `.multilingual-e5-small` for E5) or a text embedding model that was uploaded by using the Eland client.", + "extDocId": "eland-import", + "extDocUrl": "https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-import-model.html#ml-nlp-import-script", + "name": "model_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The total number of allocations that are assigned to the model across machine learning nodes.\nIncreasing this value generally increases the throughput.\nIf adaptive allocations are enabled, do not set this value because it's automatically set.", + "name": "num_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "The number of threads used by each model allocation during inference.\nThis setting generally increases the speed per inference request.\nThe inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per 
node.\nThe value must be a power of 2.\nThe maximum value is 32.", + "name": "num_threads", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L117-L151" + }, + { + "kind": "interface", + "name": { + "name": "AdaptiveAllocations", + "namespace": "inference.put_elasticsearch" + }, + "properties": [ + { + "description": "Turn on `adaptive_allocations`.", + "name": "enabled", + "required": false, + "serverDefault": false, + "type": { + "kind": "instance_of", + "type": { + "name": "boolean", + "namespace": "_builtins" + } + } + }, + { + "description": "The maximum number of allocations to scale to.\nIf set, it must be greater than or equal to `min_number_of_allocations`.", + "name": "max_number_of_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", + "name": "min_number_of_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L98-L115" + }, + { + "kind": "interface", + "name": { + "name": "ElasticsearchTaskSettings", + "namespace": "inference.put_elasticsearch" + }, + "properties": [ + { + "description": "For a `rerank` task, return the document instead of only the index.", + "name": "return_documents", + "required": false, + "serverDefault": true, + "type": { + "kind": "instance_of", + "type": { + "name": "boolean", + "namespace": "_builtins" + } + } + } + ], + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L153-L159" + }, { "kind": "interface", "name": { diff --git 
a/output/schema/schema.json b/output/schema/schema.json index 9f03cdfe98..efe8902d41 100644 --- a/output/schema/schema.json +++ b/output/schema/schema.json @@ -9393,6 +9393,51 @@ } ] }, + { + "availability": { + "serverless": { + "stability": "stable", + "visibility": "public" + }, + "stack": { + "since": "8.13.0", + "stability": "stable", + "visibility": "public" + } + }, + "description": "Create an Elasticsearch inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `elasticsearch` service.\n\n> info\n> Your Elasticsearch deployment contains preconfigured ELSER and E5 inference endpoints, you only need to create the endpoints using the API if you want to customize the settings.\n\nIf you use the ELSER or the E5 model through the `elasticsearch` service, the API request will automatically download and deploy the model if it isn't downloaded yet.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "docId": "inference-api-put-elasticsearch", + "docUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elasticsearch.html", + "name": "inference.put_elasticsearch", + "privileges": { + "cluster": [ + "manage_inference" + ] + }, + "request": { + "name": "Request", + "namespace": "inference.put_elasticsearch" + }, + "requestBodyRequired": false, + "requestMediaType": [ + "application/json" + ], + "response": { + "name": "Response", + "namespace": "inference.put_elasticsearch" + }, + "responseMediaType": [ + "application/json" + ], + "urls": [ + { + "methods": [ + "PUT" + ], + "path": "/_inference/{task_type}/{elasticsearch_inference_id}" + } + ] + }, { "availability": { "serverless": { @@ -151262,6 +151307,336 @@ }, "specLocation": "inference/put_eis/PutEisRequest.ts#L68-L70" }, + { + "kind": "interface", + "name": { + "name": "AdaptiveAllocations", + "namespace": "inference.put_elasticsearch" + }, + "properties": [ + { + "description": "Turn on `adaptive_allocations`.", + "name": "enabled", + "required": false, + "serverDefault": false, + "type": { + "kind": "instance_of", + "type": { + "name": "boolean", + "namespace": "_builtins" + } + } + }, + { + "description": "The maximum number of allocations to scale to.\nIf set, it must be greater than or equal to `min_number_of_allocations`.", + "name": "max_number_of_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + 
"namespace": "_types" + } + } + }, + { + "description": "The minimum number of allocations to scale to.\nIf set, it must be greater than or equal to 0.\nIf not defined, the deployment scales to 0.", + "name": "min_number_of_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L98-L115" + }, + { + "kind": "interface", + "name": { + "name": "ElasticsearchServiceSettings", + "namespace": "inference.put_elasticsearch" + }, + "properties": [ + { + "description": "Adaptive allocations configuration details.\nIf `enabled` is true, the number of allocations of the model is set based on the current load the process gets.\nWhen the load is high, a new model allocation is automatically created, respecting the value of `max_number_of_allocations` if it's set.\nWhen the load is low, a model allocation is automatically removed, respecting the value of `min_number_of_allocations` if it's set.\nIf `enabled` is true, do not set the number of allocations manually.", + "name": "adaptive_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "AdaptiveAllocations", + "namespace": "inference.put_elasticsearch" + } + } + }, + { + "description": "The deployment identifier for a trained model deployment.\nWhen `deployment_id` is used the `model_id` is optional.", + "name": "deployment_id", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The name of the model to use for the inference task.\nIt can be the ID of a built-in model (for example, `.multilingual-e5-small` for E5) or a text embedding model that was uploaded by using the Eland client.", + "extDocId": "eland-import", + "extDocUrl": 
"https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-import-model.html#ml-nlp-import-script", + "name": "model_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "string", + "namespace": "_builtins" + } + } + }, + { + "description": "The total number of allocations that are assigned to the model across machine learning nodes.\nIncreasing this value generally increases the throughput.\nIf adaptive allocations are enabled, do not set this value because it's automatically set.", + "name": "num_allocations", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + }, + { + "description": "The number of threads used by each model allocation during inference.\nThis setting generally increases the speed per inference request.\nThe inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per node.\nThe value must be a power of 2.\nThe maximum value is 32.", + "name": "num_threads", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "integer", + "namespace": "_types" + } + } + } + ], + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L117-L151" + }, + { + "kind": "interface", + "name": { + "name": "ElasticsearchTaskSettings", + "namespace": "inference.put_elasticsearch" + }, + "properties": [ + { + "description": "For a `rerank` task, return the document instead of only the index.", + "name": "return_documents", + "required": false, + "serverDefault": true, + "type": { + "kind": "instance_of", + "type": { + "name": "boolean", + "namespace": "_builtins" + } + } + } + ], + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L153-L159" + }, + { + "kind": "enum", + "members": [ + { + "name": "rerank" + }, + { + "name": "sparse_embedding" + }, + { + "name": "text_embedding" + } + ], + "name": { + "name": 
"ElasticsearchTaskType", + "namespace": "inference.put_elasticsearch" + }, + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L88-L92" + }, + { + "kind": "request", + "attachedBehaviors": [ + "CommonQueryParameters" + ], + "body": { + "kind": "properties", + "properties": [ + { + "description": "The chunking configuration object.", + "extDocId": "inference-chunking", + "extDocUrl": "https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html#infer-chunking-config", + "name": "chunking_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "InferenceChunkingSettings", + "namespace": "inference._types" + } + } + }, + { + "description": "The type of service supported for the specified task type. In this case, `elasticsearch`.", + "name": "service", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ServiceType", + "namespace": "inference.put_elasticsearch" + } + } + }, + { + "description": "Settings used to install the inference model. 
These settings are specific to the `elasticsearch` service.", + "name": "service_settings", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElasticsearchServiceSettings", + "namespace": "inference.put_elasticsearch" + } + } + }, + { + "description": "Settings to configure the inference task.\nThese settings are specific to the task type you specified.", + "name": "task_settings", + "required": false, + "type": { + "kind": "instance_of", + "type": { + "name": "ElasticsearchTaskSettings", + "namespace": "inference.put_elasticsearch" + } + } + } + ] + }, + "description": "Create an Elasticsearch inference endpoint.\n\nCreate an inference endpoint to perform an inference task with the `elasticsearch` service.\n\n> info\n> Your Elasticsearch deployment contains preconfigured ELSER and E5 inference endpoints, you only need to create the endpoints using the API if you want to customize the settings.\n\nIf you use the ELSER or the E5 model through the `elasticsearch` service, the API request will automatically download and deploy the model if it isn't downloaded yet.\n\n> info\n> You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. 
If using the Python client, you can set the timeout parameter to a higher value.\n\nAfter creating the endpoint, wait for the model deployment to complete before using it.\nTo verify the deployment status, use the get trained model statistics API.\nLook for `\"state\": \"fully_allocated\"` in the response and ensure that the `\"allocation_count\"` matches the `\"target_allocation_count\"`.\nAvoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources.", + "examples": { + "PutElasticsearchRequestExample1": { + "description": "Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The `model_id` must be the ID of one of the built-in ELSER models. The API will automatically download the ELSER model if it isn't already downloaded and then deploy the model.", + "summary": "ELSER sparse embedding task", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"adaptive_allocations\": { \n \"enabled\": true,\n \"min_number_of_allocations\": 1,\n \"max_number_of_allocations\": 4\n },\n \"num_threads\": 1,\n \"model_id\": \".elser_model_2\" \n }\n}" + }, + "PutElasticsearchRequestExample2": { + "description": "Run `PUT _inference/rerank/my-elastic-rerank` to create an inference endpoint that performs a rerank task using the built-in Elastic Rerank cross-encoder model. The `model_id` must be `.rerank-v1`, which is the ID of the built-in Elastic Rerank model. The API will automatically download the Elastic Rerank model if it isn't already downloaded and then deploy the model. 
Once deployed, the model can be used for semantic re-ranking with a `text_similarity_reranker` retriever.", + "summary": "Elastic rerank task", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"model_id\": \".rerank-v1\", \n \"num_threads\": 1,\n \"adaptive_allocations\": { \n \"enabled\": true,\n \"min_number_of_allocations\": 1,\n \"max_number_of_allocations\": 4\n }\n }\n}" + }, + "PutElasticsearchRequestExample3": { + "description": "Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task. The `model_id` must be the ID of one of the built-in E5 models. The API will automatically download the E5 model if it isn't already downloaded and then deploy the model.", + "summary": "E5 text embedding task", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1,\n \"model_id\": \".multilingual-e5-small\" \n }\n}" + }, + "PutElasticsearchRequestExample4": { + "description": "Run `PUT _inference/text_embedding/my-msmarco-minilm-model` to create an inference endpoint that performs a `text_embedding` task with a model that was uploaded by Eland.", + "summary": "Eland text embedding task", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 1,\n \"num_threads\": 1,\n \"model_id\": \"msmarco-MiniLM-L12-cos-v5\" \n }\n}" + }, + "PutElasticsearchRequestExample5": { + "description": "Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task and to configure adaptive allocations. 
The API request will automatically download the E5 model if it isn't already downloaded and then deploy the model.", + "summary": "Adaptive allocation", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"adaptive_allocations\": {\n \"enabled\": true,\n \"min_number_of_allocations\": 3,\n \"max_number_of_allocations\": 10\n },\n \"num_threads\": 1,\n \"model_id\": \".multilingual-e5-small\"\n }\n}" + }, + "PutElasticsearchRequestExample6": { + "description": "Run `PUT _inference/sparse_embedding/use_existing_deployment` to use an already existing model deployment when creating an inference endpoint.", + "summary": "Existing model deployment", + "value": "{\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"deployment_id\": \".elser_model_2\"\n }\n}" + } + }, + "inherits": { + "type": { + "name": "RequestBase", + "namespace": "_types" + } + }, + "name": { + "name": "Request", + "namespace": "inference.put_elasticsearch" + }, + "path": [ + { + "description": "The type of the inference task that the model will perform.", + "name": "task_type", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "ElasticsearchTaskType", + "namespace": "inference.put_elasticsearch" + } + } + }, + { + "description": "The unique identifier of the inference endpoint.\nThe must not match the `model_id`.", + "name": "elasticsearch_inference_id", + "required": true, + "type": { + "kind": "instance_of", + "type": { + "name": "Id", + "namespace": "_types" + } + } + } + ], + "query": [], + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L25-L86" + }, + { + "kind": "response", + "body": { + "kind": "value", + "value": { + "kind": "instance_of", + "type": { + "name": "InferenceEndpointInfo", + "namespace": "inference._types" + } + } + }, + "examples": { + "PutElasticsearchResponseExample1": { + "description": "A successful response from `PUT _inference/sparse_embedding/use_existing_deployment`. 
It contains the model ID and the threads and allocations settings from the model deployment.\n", + "value": "{\n \"inference_id\": \"use_existing_deployment\",\n \"task_type\": \"sparse_embedding\",\n \"service\": \"elasticsearch\",\n \"service_settings\": {\n \"num_allocations\": 2,\n \"num_threads\": 1,\n \"model_id\": \".elser_model_2\",\n \"deployment_id\": \".elser_model_2\"\n },\n \"chunking_settings\": {\n \"strategy\": \"sentence\",\n \"max_chunk_size\": 250,\n \"sentence_overlap\": 1\n }\n}" + } + }, + "name": { + "name": "Response", + "namespace": "inference.put_elasticsearch" + }, + "specLocation": "inference/put_elasticsearch/PutElasticsearchResponse.ts#L22-L24" + }, + { + "kind": "enum", + "members": [ + { + "name": "elasticsearch" + } + ], + "name": { + "name": "ServiceType", + "namespace": "inference.put_elasticsearch" + }, + "specLocation": "inference/put_elasticsearch/PutElasticsearchRequest.ts#L94-L96" + }, { "kind": "interface", "name": { diff --git a/output/typescript/types.ts b/output/typescript/types.ts index cdcc2fa037..07af9d9e72 100644 --- a/output/typescript/types.ts +++ b/output/typescript/types.ts @@ -13312,6 +13312,41 @@ export type InferencePutEisResponse = InferenceInferenceEndpointInfo export type InferencePutEisServiceType = 'elastic' +export interface InferencePutElasticsearchAdaptiveAllocations { + enabled?: boolean + max_number_of_allocations?: integer + min_number_of_allocations?: integer +} + +export interface InferencePutElasticsearchElasticsearchServiceSettings { + adaptive_allocations?: InferencePutElasticsearchAdaptiveAllocations + deployment_id?: string + model_id: string + num_allocations?: integer + num_threads: integer +} + +export interface InferencePutElasticsearchElasticsearchTaskSettings { + return_documents?: boolean +} + +export type InferencePutElasticsearchElasticsearchTaskType = 'rerank' | 'sparse_embedding' | 'text_embedding' + +export interface InferencePutElasticsearchRequest extends RequestBase { + 
task_type: InferencePutElasticsearchElasticsearchTaskType + elasticsearch_inference_id: Id + body?: { + chunking_settings?: InferenceInferenceChunkingSettings + service: InferencePutElasticsearchServiceType + service_settings: InferencePutElasticsearchElasticsearchServiceSettings + task_settings?: InferencePutElasticsearchElasticsearchTaskSettings + } +} + +export type InferencePutElasticsearchResponse = InferenceInferenceEndpointInfo + +export type InferencePutElasticsearchServiceType = 'elasticsearch' + export interface InferencePutElserAdaptiveAllocations { enabled?: boolean max_number_of_allocations?: integer diff --git a/specification/_doc_ids/table.csv b/specification/_doc_ids/table.csv index 869457d488..4f3450eb2b 100644 --- a/specification/_doc_ids/table.csv +++ b/specification/_doc_ids/table.csv @@ -179,6 +179,7 @@ document-input-parameters,https://www.elastic.co/guide/en/elasticsearch/referenc docvalue-fields,https://www.elastic.co/guide/en/elasticsearch/reference/current/search-fields.html#docvalue-fields dot-expand-processor,https://www.elastic.co/guide/en/elasticsearch/reference/current/dot-expand-processor.html drop-processor,https://www.elastic.co/guide/en/elasticsearch/reference/current/drop-processor.html +eland-import,https://www.elastic.co/guide/en/machine-learning/current/ml-nlp-import-model.html#ml-nlp-import-script enrich-processor,https://www.elastic.co/guide/en/elasticsearch/reference/current/enrich-processor.html enrich-stats-api,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-enrich-stats eql-async-search-api,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-eql-get @@ -326,6 +327,7 @@ inference-api-post-eis-chat-completion,https://www.elastic.co/docs/api/doc/elast inference-api-put,https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put inference-api-put-cohere,https://www.elastic.co/guide/en/elasticsearch/reference/branch/infer-service-cohere.html 
inference-api-put-eis,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-eis.html +inference-api-put-elasticsearch,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elasticsearch.html inference-api-put-elser,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elser.html inference-api-put-huggingface,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-hugging-face.html inference-api-put-jinaai,https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-jinaai.html diff --git a/specification/_json_spec/inference.put_elasticsearch.json b/specification/_json_spec/inference.put_elasticsearch.json new file mode 100644 index 0000000000..0c326b79e9 --- /dev/null +++ b/specification/_json_spec/inference.put_elasticsearch.json @@ -0,0 +1,35 @@ +{ + "inference.put_elasticsearch": { + "documentation": { + "url": "https://www.elastic.co/guide/en/elasticsearch/reference/current/infer-service-elasticsearch.html", + "description": "Configure an Elasticsearch inference endpoint" + }, + "stability": "stable", + "visibility": "public", + "headers": { + "accept": ["application/json"], + "content_type": ["application/json"] + }, + "url": { + "paths": [ + { + "path": "/_inference/{task_type}/{elasticsearch_inference_id}", + "methods": ["PUT"], + "parts": { + "task_type": { + "type": "string", + "description": "The task type" + }, + "elasticsearch_inference_id": { + "type": "string", + "description": "The inference Id" + } + } + } + ] + }, + "body": { + "description": "The inference endpoint's task and service settings" + } + } +} diff --git a/specification/inference/put_elasticsearch/PutElasticsearchRequest.ts b/specification/inference/put_elasticsearch/PutElasticsearchRequest.ts new file mode 100644 index 0000000000..a863e8ba45 --- /dev/null +++ b/specification/inference/put_elasticsearch/PutElasticsearchRequest.ts @@ -0,0 +1,159 @@ +/* + * Licensed to 
Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import { InferenceChunkingSettings } from '@inference/_types/Services' +import { RequestBase } from '@_types/Base' +import { Id } from '@_types/common' +import { integer } from '@_types/Numeric' + +/** + * Create an Elasticsearch inference endpoint. + * + * Create an inference endpoint to perform an inference task with the `elasticsearch` service. + * + * > info + * > Your Elasticsearch deployment contains preconfigured ELSER and E5 inference endpoints, you only need to create the endpoints using the API if you want to customize the settings. + * + * If you use the ELSER or the E5 model through the `elasticsearch` service, the API request will automatically download and deploy the model if it isn't downloaded yet. + * + * > info + * > You might see a 502 bad gateway error in the response when using the Kibana Console. This error usually just reflects a timeout, while the model downloads in the background. You can check the download progress in the Machine Learning UI. If using the Python client, you can set the timeout parameter to a higher value. + * + * After creating the endpoint, wait for the model deployment to complete before using it. 
+ * To verify the deployment status, use the get trained model statistics API. + * Look for `"state": "fully_allocated"` in the response and ensure that the `"allocation_count"` matches the `"target_allocation_count"`. + * Avoid creating multiple endpoints for the same model unless required, as each endpoint consumes significant resources. + * @rest_spec_name inference.put_elasticsearch + * @availability stack since=8.13.0 stability=stable visibility=public + * @availability serverless stability=stable visibility=public + * @cluster_privileges manage_inference + * @doc_id inference-api-put-elasticsearch + */ +export interface Request extends RequestBase { + urls: [ + { + path: '/_inference/{task_type}/{elasticsearch_inference_id}' + methods: ['PUT'] + } + ] + path_parts: { + /** + * The type of the inference task that the model will perform. + */ + task_type: ElasticsearchTaskType + /** + * The unique identifier of the inference endpoint. + * It must not match the `model_id`. + */ + elasticsearch_inference_id: Id + } + body: { + /** + * The chunking configuration object. + * @ext_doc_id inference-chunking + */ + chunking_settings?: InferenceChunkingSettings + /** + * The type of service supported for the specified task type. In this case, `elasticsearch`. + */ + service: ServiceType + /** + * Settings used to install the inference model. These settings are specific to the `elasticsearch` service. + */ + service_settings: ElasticsearchServiceSettings + /** + * Settings to configure the inference task. + * These settings are specific to the task type you specified. + */ + task_settings?: ElasticsearchTaskSettings + } +} + +export enum ElasticsearchTaskType { + rerank, + sparse_embedding, + text_embedding +} + +export enum ServiceType { + elasticsearch +} + +export class AdaptiveAllocations { + /** + * Turn on `adaptive_allocations`. + * @server_default false + */ + enabled?: boolean + /** + * The maximum number of allocations to scale to. 
+ * If set, it must be greater than or equal to `min_number_of_allocations`. + */ + max_number_of_allocations?: integer + /** + * The minimum number of allocations to scale to. + * If set, it must be greater than or equal to 0. + * If not defined, the deployment scales to 0. + */ + min_number_of_allocations?: integer +} + +export class ElasticsearchServiceSettings { + /** + * Adaptive allocations configuration details. + * If `enabled` is true, the number of allocations of the model is set based on the current load the process gets. + * When the load is high, a new model allocation is automatically created, respecting the value of `max_number_of_allocations` if it's set. + * When the load is low, a model allocation is automatically removed, respecting the value of `min_number_of_allocations` if it's set. + * If `enabled` is true, do not set the number of allocations manually. + */ + adaptive_allocations?: AdaptiveAllocations + /** + * The deployment identifier for a trained model deployment. + * When `deployment_id` is used the `model_id` is optional. + */ + deployment_id?: string + /** + * The name of the model to use for the inference task. + * It can be the ID of a built-in model (for example, `.multilingual-e5-small` for E5) or a text embedding model that was uploaded by using the Eland client. + * @ext_doc_id eland-import + */ + model_id: string + /** + * The total number of allocations that are assigned to the model across machine learning nodes. + * Increasing this value generally increases the throughput. + * If adaptive allocations are enabled, do not set this value because it's automatically set. + */ + num_allocations?: integer + /** + * The number of threads used by each model allocation during inference. + * This setting generally increases the speed per inference request. + * The inference process is a compute-bound process; `threads_per_allocations` must not exceed the number of available allocated processors per node. 
+ * The value must be a power of 2. + * The maximum value is 32. + */ + num_threads: integer +} + +export class ElasticsearchTaskSettings { + /** + * For a `rerank` task, return the document instead of only the index. + * @server_default true + */ + return_documents?: boolean +} diff --git a/specification/inference/put_elasticsearch/PutElasticsearchResponse.ts b/specification/inference/put_elasticsearch/PutElasticsearchResponse.ts new file mode 100644 index 0000000000..d40639b031 --- /dev/null +++ b/specification/inference/put_elasticsearch/PutElasticsearchResponse.ts @@ -0,0 +1,24 @@ +/* + * Licensed to Elasticsearch B.V. under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch B.V. licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import { InferenceEndpointInfo } from '@inference/_types/Services' + +export class Response { + body: InferenceEndpointInfo +} diff --git a/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample1.yaml b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample1.yaml new file mode 100644 index 0000000000..e84111179b --- /dev/null +++ b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample1.yaml @@ -0,0 +1,17 @@ +summary: ELSER sparse embedding task +description: Run `PUT _inference/sparse_embedding/my-elser-model` to create an inference endpoint that performs a `sparse_embedding` task. The `model_id` must be the ID of one of the built-in ELSER models. The API will automatically download the ELSER model if it isn't already downloaded and then deploy the model. +# method_request: "PUT _inference/sparse_embedding/my-elser-model" +# type: "request" +value: |- + { + "service": "elasticsearch", + "service_settings": { + "adaptive_allocations": { + "enabled": true, + "min_number_of_allocations": 1, + "max_number_of_allocations": 4 + }, + "num_threads": 1, + "model_id": ".elser_model_2" + } + } diff --git a/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample2.yaml b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample2.yaml new file mode 100644 index 0000000000..751da766e6 --- /dev/null +++ b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample2.yaml @@ -0,0 +1,17 @@ +summary: Elastic rerank task +description: Run `PUT _inference/rerank/my-elastic-rerank` to create an inference endpoint that performs a rerank task using the built-in Elastic Rerank cross-encoder model. The `model_id` must be `.rerank-v1`, which is the ID of the built-in Elastic Rerank model. 
The API will automatically download the Elastic Rerank model if it isn't already downloaded and then deploy the model. Once deployed, the model can be used for semantic re-ranking with a `text_similarity_reranker` retriever. +# method_request: "PUT _inference/rerank/my-elastic-rerank" +# type: "request" +value: |- + { + "service": "elasticsearch", + "service_settings": { + "model_id": ".rerank-v1", + "num_threads": 1, + "adaptive_allocations": { + "enabled": true, + "min_number_of_allocations": 1, + "max_number_of_allocations": 4 + } + } + } diff --git a/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample3.yaml b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample3.yaml new file mode 100644 index 0000000000..8f1220ffb9 --- /dev/null +++ b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample3.yaml @@ -0,0 +1,13 @@ +summary: E5 text embedding task +description: Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task. The `model_id` must be the ID of one of the built-in E5 models. The API will automatically download the E5 model if it isn't already downloaded and then deploy the model. 
+# method_request: "PUT _inference/text_embedding/my-e5-model" +# type: "request" +value: |- + { + "service": "elasticsearch", + "service_settings": { + "num_allocations": 1, + "num_threads": 1, + "model_id": ".multilingual-e5-small" + } + } diff --git a/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample4.yaml b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample4.yaml new file mode 100644 index 0000000000..c9aa5d1f49 --- /dev/null +++ b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample4.yaml @@ -0,0 +1,13 @@ +summary: Eland text embedding task +description: Run `PUT _inference/text_embedding/my-msmarco-minilm-model` to create an inference endpoint that performs a `text_embedding` task with a model that was uploaded by Eland. +# method_request: "PUT _inference/text_embedding/my-msmarco-minilm-model" +# type: "request" +value: |- + { + "service": "elasticsearch", + "service_settings": { + "num_allocations": 1, + "num_threads": 1, + "model_id": "msmarco-MiniLM-L12-cos-v5" + } + } diff --git a/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample5.yaml b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample5.yaml new file mode 100644 index 0000000000..83be34fe27 --- /dev/null +++ b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample5.yaml @@ -0,0 +1,17 @@ +summary: Adaptive allocation +description: Run `PUT _inference/text_embedding/my-e5-model` to create an inference endpoint that performs a `text_embedding` task and to configure adaptive allocations. The API request will automatically download the E5 model if it isn't already downloaded and then deploy the model. 
+# method_request: "PUT _inference/text_embedding/my-e5-model" +# type: "request" +value: |- + { + "service": "elasticsearch", + "service_settings": { + "adaptive_allocations": { + "enabled": true, + "min_number_of_allocations": 3, + "max_number_of_allocations": 10 + }, + "num_threads": 1, + "model_id": ".multilingual-e5-small" + } + } diff --git a/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample6.yaml b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample6.yaml new file mode 100644 index 0000000000..0e7aaa1001 --- /dev/null +++ b/specification/inference/put_elasticsearch/examples/request/PutElasticsearchRequestExample6.yaml @@ -0,0 +1,11 @@ +summary: Existing model deployment +description: Run `PUT _inference/sparse_embedding/use_existing_deployment` to use an already existing model deployment when creating an inference endpoint. +# method_request: "PUT _inference/sparse_embedding/use_existing_deployment" +# type: "request" +value: |- + { + "service": "elasticsearch", + "service_settings": { + "deployment_id": ".elser_model_2" + } + } diff --git a/specification/inference/put_elasticsearch/examples/response/PutElasticsearchResponseExample1.yaml b/specification/inference/put_elasticsearch/examples/response/PutElasticsearchResponseExample1.yaml new file mode 100644 index 0000000000..2a9c7f6033 --- /dev/null +++ b/specification/inference/put_elasticsearch/examples/response/PutElasticsearchResponseExample1.yaml @@ -0,0 +1,22 @@ +# summary: +description: > + A successful response from `PUT _inference/sparse_embedding/use_existing_deployment`. It contains the model ID and the threads and allocations settings from the model deployment. 
+# type: "response" +# response_code: +value: |- + { + "inference_id": "use_existing_deployment", + "task_type": "sparse_embedding", + "service": "elasticsearch", + "service_settings": { + "num_allocations": 2, + "num_threads": 1, + "model_id": ".elser_model_2", + "deployment_id": ".elser_model_2" + }, + "chunking_settings": { + "strategy": "sentence", + "max_chunk_size": 250, + "sentence_overlap": 1 + } + }