', methods=['POST'])
+def receive_image_generation_by_organization_model(organization, model):
+    return process_image_generation_request(request, f'{organization}/{model}')
+
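+# Expected request body (see templates/completions.html):
+#   {"inputs": "<prompt>", "context": "<optional context>", "max_tokens": <optional int, default 32>}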
+def process_completion_request(request, model):
+    request_data = request.data
+    json_data = json.loads(request_data)
+    try:
+        prompt = json_data["inputs"]
+        context = json_data.get("context", "")
+        max_tokens = json_data.get("max_tokens", 32)
+
+        inference_generator = CompletionGenerator.CompletionGenerator(model)
+        result, num_prompt_tokens, num_result_tokens = inference_generator.perform_inference(prompt, context, max_tokens)
+        return jsonify(create_responses.create_completion_response(
+            result,
+            model,
+            num_prompt_tokens,
+            num_result_tokens))
+    except Exception as e:
+        print(e)
+        return "Sorry, unable to perform sentence completion with model {}".format(model)
+
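+# Expected request body (see templates/embeddings.html):
+#   {"inputs": ["sentence 1", "sentence 2", ...]}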
+def process_embedding_request(request, model):
+    request_data = request.data
+    json_data = json.loads(request_data)
+    try:
+        sentences = json_data["inputs"]
+        inference_generator = EmbeddingGenerator.EmbeddingGenerator(model)
+        embeddings, num_prompt_tokens = inference_generator.perform_inference(sentences)
+        return jsonify(create_responses.create_embedding_response(
+            embeddings,
+            num_prompt_tokens))
+    except Exception as e:
+        print(e)
+        return "Sorry, unable to generate embeddings with model {}".format(model)
+
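+# Expected request body (see templates/images.html):
+#   {"inputs": "<prompt>", "n": <number of images>, "size": "<width>x<height>"}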
+def process_image_generation_request(request, model):
+    request_data = request.data
+    json_data = json.loads(request_data)
+    try:
+        # read the fields inside the try block so a malformed request
+        # returns the friendly error below instead of an unhandled 500
+        prompt = json_data["inputs"]
+        num_images = json_data.get("n", 1)
+        image_size = json_data.get("size")
+        image_generator = ImageGenerator.ImageGenerator(model)
+        image_data = image_generator.perform_inference(prompt, num_images, image_size)
+        return jsonify(create_responses.create_image_gen_response(image_data))
+    except Exception as e:
+        print(e)
+        return "Sorry, unable to generate images with model {}".format(model)
+
+# main driver function
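+# Example invocation (assuming this file is the Flask entry point, e.g. app.py):
+#   python app.py --ip 0.0.0.0 --port 5000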
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--ip',
+                        default='0.0.0.0',
+                        help='ip address for flask server endpoint')
+    parser.add_argument('-p', '--port',
+                        default=5000,
+                        help='port for flask server endpoint',
+                        type=int)
+    args = parser.parse_args()
+
+    host_ip = args.ip
+    port = args.port
+
+    app.run(host=host_ip, debug=True, port=port)
diff --git a/samples/apps/hugging-face-http-server/requirements.txt b/samples/apps/hugging-face-http-server/requirements.txt
new file mode 100644
index 000000000000..ac5b62201ab9
Binary files /dev/null and b/samples/apps/hugging-face-http-server/requirements.txt differ
diff --git a/samples/apps/hugging-face-http-server/static/css/styles.css b/samples/apps/hugging-face-http-server/static/css/styles.css
new file mode 100644
index 000000000000..eea906c30f2d
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/static/css/styles.css
@@ -0,0 +1,11 @@
+/* Copyright (c) Microsoft. All rights reserved. */
+
+.code {
+ background-color: black;
+ height: auto;
+ width: auto;
+ border-style: solid;
+ font-family:'Courier New', Courier, monospace;
+ color: white;
+ font-size: 14px;
+}
diff --git a/samples/apps/hugging-face-http-server/templates/completions.html b/samples/apps/hugging-face-http-server/templates/completions.html
new file mode 100644
index 000000000000..bf88cc859abd
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/templates/completions.html
@@ -0,0 +1,24 @@
+
+
+{% extends 'documentation.html' %}
+
+{% block body %}
+
+Completions
+
+Example Request
+
+
+curl http://localhost:5000/completions/{model} \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"inputs": "this is a test"}'
+
+
+
+ HF Text Generation Models
+
+
+ HF Text Summarization Models
+
+
+{% endblock %}
diff --git a/samples/apps/hugging-face-http-server/templates/documentation.html b/samples/apps/hugging-face-http-server/templates/documentation.html
new file mode 100644
index 000000000000..84d1384067a0
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/templates/documentation.html
@@ -0,0 +1,27 @@
+
+
+
+
+
+
+
+
+ Flask Docker
+
+
+ API Documentation
+ Home
+ Completions
+ Embeddings
+ Images
+
+ {% block body %}
+
+
+ Documentation
+
+
+ {% endblock %}
+
+
+
\ No newline at end of file
diff --git a/samples/apps/hugging-face-http-server/templates/embeddings.html b/samples/apps/hugging-face-http-server/templates/embeddings.html
new file mode 100644
index 000000000000..15d81cf0d3b4
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/templates/embeddings.html
@@ -0,0 +1,21 @@
+
+
+{% extends 'documentation.html' %}
+
+{% block body %}
+
+Embeddings
+
+Example Request
+
+
+curl http://localhost:5000/embeddings/{model} \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"inputs": ["test string 1", "test string 2", ...]}'
+
+
+
+ HF Text Embedding Models
+
+
+{% endblock %}
diff --git a/samples/apps/hugging-face-http-server/templates/home.html b/samples/apps/hugging-face-http-server/templates/home.html
new file mode 100644
index 000000000000..e71ec5a53878
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/templates/home.html
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+ Flask Docker
+
+
+ Your Hugging Face model server is running
+ Documentation
+
+
\ No newline at end of file
diff --git a/samples/apps/hugging-face-http-server/templates/images.html b/samples/apps/hugging-face-http-server/templates/images.html
new file mode 100644
index 000000000000..eec18e540c35
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/templates/images.html
@@ -0,0 +1,21 @@
+
+
+{% extends 'documentation.html' %}
+
+{% block body %}
+
+Images
+
+Example Request
+
+
+curl http://localhost:5000/images/generations/{model} \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -d '{"inputs": "a test image", "n": 1, "size": "256x256"}'
+
+
+
+ HF Text-to-Image Models
+
+
+{% endblock %}
diff --git a/samples/apps/hugging-face-http-server/utils/CompletionGenerator.py b/samples/apps/hugging-face-http-server/utils/CompletionGenerator.py
new file mode 100644
index 000000000000..42369d1a0e0a
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/utils/CompletionGenerator.py
@@ -0,0 +1,33 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+from . import InferenceGenerator
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# The model used to load the tokenizer is somewhat arbitrary,
+# since tokenizers are shared across models of the same type.
+
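+# Note: decoder-only models such as GPT-2 define no pad token, so the EOS
+# token is reused for padding below.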
+class CompletionGenerator(InferenceGenerator.InferenceGenerator):
+    def __init__(self, model_name):
+        super().__init__(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+
+    def perform_inference(self, prompt, context, max_tokens):
+        model = AutoModelForCausalLM.from_pretrained(self.model_name, is_decoder=True)
+        model.to(self.device)
+
+        encodings = self.tokenizer.encode_plus(
+            text=prompt,
+            text_pair=context,
+            truncation=True,
+            return_tensors='pt')
+
+        # move the inputs to the same device as the model before generating
+        input_ids = encodings.input_ids.to(self.device)
+        generated_ids = model.generate(
+            input_ids,
+            max_length=max_tokens,
+            # num_beams = 5,
+            # temperature = 0.8,
+            no_repeat_ngram_size=4,
+            early_stopping=True)
+
+        return self.tokenizer.decode(generated_ids[0]), encodings.input_ids.numel(), len(generated_ids[0])
diff --git a/samples/apps/hugging-face-http-server/utils/EmbeddingGenerator.py b/samples/apps/hugging-face-http-server/utils/EmbeddingGenerator.py
new file mode 100644
index 000000000000..47495ca56fe5
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/utils/EmbeddingGenerator.py
@@ -0,0 +1,32 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import torch
+from . import InferenceGenerator
+from transformers import AutoModel, AutoTokenizer
+
+class EmbeddingGenerator(InferenceGenerator.InferenceGenerator):
+    def __init__(self, model_name):
+        super().__init__(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+
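+    # Mean pooling: average the token embeddings over the sequence dimension,
+    # using the attention mask so that padding tokens do not contribute.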
+    def _mean_pooling(self, model_output, attention_mask):
+        token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
+        input_mask_expanded = attention_mask.unsqueeze(-1).float()
+        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
+        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+        return summed / counts
+
+    def perform_inference(self, sentences):
+        model = AutoModel.from_pretrained(self.model_name)
+        model.to(self.device)
+
+        # move the encoded batch to the same device as the model
+        encodings = self.tokenizer(
+            sentences,
+            padding=True,
+            truncation=True,
+            return_tensors='pt').to(self.device)
+
+        model_output = model(**encodings)
+        embeddings = self._mean_pooling(model_output, encodings['attention_mask'])
+        return embeddings, encodings.input_ids.numel()
diff --git a/samples/apps/hugging-face-http-server/utils/ImageGenerator.py b/samples/apps/hugging-face-http-server/utils/ImageGenerator.py
new file mode 100644
index 000000000000..3a6027ae1bff
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/utils/ImageGenerator.py
@@ -0,0 +1,35 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import base64
+from io import BytesIO
+
+from diffusers import DiffusionPipeline
+
+from . import InferenceGenerator
+
+class ImageGenerator(InferenceGenerator.InferenceGenerator):
+    def __init__(self, model_name):
+        super().__init__(model_name)
+        self.default_size = 512
+
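+    # Generates num_images images for the prompt and returns them as
+    # base64-encoded PNGs in OpenAI-style {"b64_json": ...} entries.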
+    def perform_inference(self, prompt, num_images, size):
+        generator = DiffusionPipeline.from_pretrained(self.model_name)
+        generator.to(self.device)
+
+        height = self.default_size
+        width = self.default_size
+
+        if size is not None:
+            # size is formatted "{width}x{height}", e.g. "256x256"
+            width, height = (int(dim) for dim in size.split("x"))
+
+        images = generator([prompt] * num_images, height=height, width=width).images
+
+        b64_images = []
+        for image in images:
+            buffered = BytesIO()
+            image.save(buffered, format="PNG")
+            base64_image = base64.b64encode(buffered.getvalue())
+            b64_images.append({"b64_json": base64_image.decode()})
+        return b64_images
diff --git a/samples/apps/hugging-face-http-server/utils/InferenceGenerator.py b/samples/apps/hugging-face-http-server/utils/InferenceGenerator.py
new file mode 100644
index 000000000000..db5ba93eb9d4
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/utils/InferenceGenerator.py
@@ -0,0 +1,14 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import os
+import torch
+
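+# Base class shared by the generators: stores the model name and selects a
+# CUDA device when one is available, falling back to CPU.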
+class InferenceGenerator:
+    def __init__(self, model_name):
+        os.environ['TOKENIZERS_PARALLELISM'] = "false"
+        self.model_name = model_name
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
diff --git a/samples/apps/hugging-face-http-server/utils/create_responses.py b/samples/apps/hugging-face-http-server/utils/create_responses.py
new file mode 100644
index 000000000000..d92043e54761
--- /dev/null
+++ b/samples/apps/hugging-face-http-server/utils/create_responses.py
@@ -0,0 +1,43 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+from datetime import datetime
+
+# These responses are modeled after the OpenAI REST API; the completion
+# response follows the Hugging Face Inference API's "generated_text" shape.
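+#
+# Example shapes (illustrative values):
+#   completion: [{"generated_text": "..."}]
+#   embedding:  {"object": "list", "data": [{"object": "embedding", "index": 0, "embedding": [...]}], "usage": {...}}
+#   image:      {"created": 1680000000, "data": [{"b64_json": "..."}]}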
+
+def create_completion_response(completion_text, model, num_prompt_tokens, num_completion_tokens):
+    # model and the token counts are accepted but unused in this response shape
+    data = [{
+        "generated_text": completion_text
+    }]
+    return data
+
+def create_embedding_indices(embeddings):
+    data_entries = []
+    for index, embedding in enumerate(embeddings):
+        data_entries.append({
+            "object": "embedding",
+            "index": index,
+            "embedding": embedding.tolist()
+        })
+    return data_entries
+
+def create_embedding_response(embeddings, num_prompt_tokens):
+    data_entries = create_embedding_indices(embeddings)
+    data = {
+        "object": "list",
+        "data": data_entries,
+        "usage": {
+            "prompt_tokens": num_prompt_tokens,
+            "total_tokens": num_prompt_tokens
+        }
+    }
+    return data
+
+
+def create_image_gen_response(image_data):
+    data = {
+        # Unix timestamp, matching the OpenAI image response format
+        "created": int(datetime.now().timestamp()),
+        "data": image_data
+    }
+    return data
diff --git a/samples/dotnet/kernel-syntax-examples/Example20_HuggingFace.cs b/samples/dotnet/kernel-syntax-examples/Example20_HuggingFace.cs
new file mode 100644
index 000000000000..68fca8c88557
--- /dev/null
+++ b/samples/dotnet/kernel-syntax-examples/Example20_HuggingFace.cs
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System;
+using System.Threading.Tasks;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.Connectors.HuggingFace.TextCompletion;
+using RepoUtils;
+
+/**
+ * The following example shows how to use Semantic Kernel with HuggingFace API.
+ */
+
+// ReSharper disable once InconsistentNaming
+public static class Example20_HuggingFace
+{
+    public static async Task RunAsync()
+    {
+        Console.WriteLine("======== HuggingFace text completion AI ========");
+
+        IKernel kernel = new KernelBuilder().WithLogger(ConsoleLogger.Log).Build();
+
+        // Add HuggingFace text completion service
+        kernel.Config.AddTextCompletionService("hf-text-completion",
+            (kernel) => new HuggingFaceTextCompletion(Env.Var("HF_API_KEY"), "gpt2"));
+
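+        // {{$input}} is the Semantic Kernel prompt-template variable that is
+        // bound to the argument passed to InvokeAsync below.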
+        const string FUNCTION_DEFINITION = "Question: {{$input}}; Answer:";
+
+        var questionAnswerFunction = kernel.CreateSemanticFunction(FUNCTION_DEFINITION);
+
+        var result = await questionAnswerFunction.InvokeAsync("What is New York?");
+
+        Console.WriteLine(result);
+    }
+}
diff --git a/samples/dotnet/kernel-syntax-examples/Program.cs b/samples/dotnet/kernel-syntax-examples/Program.cs
index 8799c3c1507e..4e0c993d56ea 100644
--- a/samples/dotnet/kernel-syntax-examples/Program.cs
+++ b/samples/dotnet/kernel-syntax-examples/Program.cs
@@ -65,6 +65,9 @@ public static async Task Main()
         await Example19_Qdrant.RunAsync();
         Console.WriteLine("== DONE ==");
+
+        await Example20_HuggingFace.RunAsync();
+        Console.WriteLine("== DONE ==");
     }
 }
 #pragma warning restore CS1591