Nova - 多模态理解

使用之前需要把这三个模型申请权限:

media/model_access.png

它的请求参数如下:

{
  "system": [
    {
      "text": string
    }
  ],
  "messages": [
    {
      "role": "user",# first turn should always be the user turn
      "content": [
        {
          "text": string
        },
        {
          "image": {
            "format": "jpeg"| "png" | "gif" | "webp",
            "source": {
              "bytes": "base64EncodedImageDataHere..."#  base64-encoded binary
            }
          }
        },
        {
          "video": {
            "format": "mkv" | "mov" | "mp4" | "webm" | "three_gp" | "flv" | "mpeg" | "mpg" | "wmv",
            "source": {
            # source can be an s3 location or base64 bytes, depending on the size of the input file.
               "s3Location": {
                "uri": string, #  example: s3://my-bucket/object-key
                "bucketOwner": string #  (Optional) example: 123456789012)
               }
              "bytes": "base64EncodedImageDataHere..." #  base64-encoded binary
            }
          }
        },
      ]
    },
    {
      "role": "assistant",
      "content": [
        {
          "text": string # prefilling assistant turn
        }
      ]
    }
  ],
 "inferenceConfig":{ # all Optional
    "max_new_tokens": int, #  greater than 0, equal or less than 5k (default: dynamic*)
    "temperature": float, # greater then 0 and less than 1.0 (default: 0.7)
    "top_p": float, #  greater than 0, equal or less than 1.0 (default: 0.9)
    "top_k": int #  0 or greater (default: 50)
    "stopSequences": [string]
  },
  "toolConfig": { #  all Optional
        "tools": [
                {
                    "toolSpec": {
                        "name": string # menaingful tool name (Max char: 64)
                        "description": string # meaningful description of the tool
                        "inputSchema": {
                            "json": { # The JSON schema for the tool. For more information, see JSON Schema Reference
                                "type": "object",
                                "properties": {
                                    <args>: { # arguments 
                                        "type": string, # argument data type
                                        "description": string # meaningful description
                                    }
                                },
                                "required": [
                                    string # args
                                ]
                            }
                        }
                    }
                }
            ],
   "toolChoice": "any" //Amazon Nova models ONLY support tool choice of "any"
        }
    }
}

文本理解

以下示例使用 Nova Lite 进行文本理解。

invoke_model API 调用

以下代码调用nova lite,来讲个笑话:

import json

import boto3

# Bedrock Runtime client; Nova models are invoked through this service.
client = boto3.client(service_name='bedrock-runtime', region_name="us-east-1")

# Cross-region inference profile IDs for the three Nova understanding models.
PRO_MODEL_ID = "us.amazon.nova-pro-v1:0"
LITE_MODEL_ID = "us.amazon.nova-lite-v1:0"
MICRO_MODEL_ID = "us.amazon.nova-micro-v1:0"

# Request payload in the Nova native schema: a single user turn plus a
# system prompt that constrains the response language.
native_request = {
    "messages": [  # Define one or more messages using the "user" and "assistant" roles.
        {"role": "user", "content": [{"text": "tell me a joke"}]},
    ],
    "system": [
        {"text": "You should respond to all messages in Chinese"}
    ],
    "inferenceConfig": {"max_new_tokens": 300, "top_p": 0.9, "top_k": 20, "temperature": 0.7},
}

# Invoke the model and extract the response body.
response = client.invoke_model(modelId=LITE_MODEL_ID, body=json.dumps(native_request))

model_response = json.loads(response["body"].read())

# Print the text content for easy readability.
content_text = model_response["output"]["message"]["content"][0]["text"]
print("\n[Response Content Text]")
print(content_text)

image-20241215144937833

Streaming API 调用

下面的示例演示了如何使用基于文本的流式 API:

import json
from datetime import datetime

import boto3

# Bedrock Runtime client; streaming invocation goes through the same service.
client = boto3.client(service_name='bedrock-runtime', region_name="us-east-1")

# Cross-region inference profile IDs for the three Nova understanding models.
PRO_MODEL_ID = "us.amazon.nova-pro-v1:0"
LITE_MODEL_ID = "us.amazon.nova-lite-v1:0"
MICRO_MODEL_ID = "us.amazon.nova-micro-v1:0"

# Nova native request: one user turn plus a creative-writing system prompt.
request_body = {
    "messages": [{"role": "user", "content": [{"text": "A camping trip"}]}],
    "system": [
        {
            "text": "Act as a creative writing assistant. When the user provides you with a topic, write a short story about that topic."}
    ],
    "inferenceConfig": {"max_new_tokens": 500, "top_p": 0.9, "top_k": 20, "temperature": 0.7},
}

start_time = datetime.now()

# Invoke the model with the response stream
response = client.invoke_model_with_response_stream(
    modelId=LITE_MODEL_ID, body=json.dumps(request_body)
)

print("Awaiting first token...")

chunk_count = 0
time_to_first_token = None  # latency from request start to first generated token

# Process the response stream. Each event wraps a JSON chunk; only chunks
# carrying a "contentBlockDelta" contain generated text to display.
stream = response.get("body")
if stream:
    for event in stream:
        chunk = event.get("chunk")
        if not chunk:
            continue
        chunk_json = json.loads(chunk.get("bytes").decode())
        content_block_delta = chunk_json.get("contentBlockDelta")
        if not content_block_delta:
            continue
        if time_to_first_token is None:
            # First text-bearing chunk: record and report the latency once.
            time_to_first_token = datetime.now() - start_time
            print(f"Time to first token: {time_to_first_token}")
        chunk_count += 1
        print(content_block_delta.get("delta").get("text"), end="")
    print(f"Total chunks: {chunk_count}")
else:
    print("No response stream received.")

image-20241215145338608

图像理解

让我们看看 Nova 模型在图像理解用例上的表现。在这里,我们将传递一张日落图像,并要求模型尝试为该图像创建 3 个艺术标题。

A Sunset Image

将上面图片保存为sunset.png,放在代码同一目录并运行:

import json
import boto3
import base64
from datetime import datetime

# Bedrock Runtime client for the us-east-1 region.
client = boto3.client(service_name='bedrock-runtime', region_name="us-east-1")

# Inference profile IDs for the Nova model family.
PRO_MODEL_ID = "us.amazon.nova-pro-v1:0"
LITE_MODEL_ID = "us.amazon.nova-lite-v1:0"
MICRO_MODEL_ID = "us.amazon.nova-micro-v1:0"

# Load the local image and base64-encode it for the "bytes" source field.
with open("sunset.png", "rb") as image_file:
    image_b64 = base64.b64encode(image_file.read()).decode("utf-8")

# A single user turn pairing the image with a text instruction, plus a
# system prompt that sets the assistant's persona and expected output.
request_payload = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"image": {"format": "png", "source": {"bytes": image_b64}}},
                {"text": "Provide art titles for this image."},
            ],
        }
    ],
    "system": [
        {"text": "You are an expert artist. When the user provides you with an image, provide 3 potential art titles"}
    ],
    "inferenceConfig": {"max_new_tokens": 300, "top_p": 0.1, "top_k": 20, "temperature": 0.3},
}

# Send the request and decode the JSON response body.
response = client.invoke_model(modelId=LITE_MODEL_ID, body=json.dumps(request_payload))
model_response = json.loads(response["body"].read())

# Show only the generated text for readability.
content_text = model_response["output"]["message"]["content"][0]["text"]
print("\n[Response Content Text]")
print(content_text)

结果:

image-20241215150025783

视频理解

现在,让我们看看 Nova 在视频理解用例方面的表现:

下载视频:

wget https://pingfan.s3.amazonaws.com/files/the-sea.mp4

使用Nova分析视频,并为它起标题:

import json
import boto3
import base64

# Bedrock Runtime client for the us-east-1 region.
client = boto3.client(service_name='bedrock-runtime', region_name="us-east-1")

# Inference profile IDs for the Nova model family.
PRO_MODEL_ID = "us.amazon.nova-pro-v1:0"
LITE_MODEL_ID = "us.amazon.nova-lite-v1:0"
MICRO_MODEL_ID = "us.amazon.nova-micro-v1:0"

# Load the local clip and base64-encode it for the "bytes" source field.
with open("the-sea.mp4", "rb") as video_file:
    video_b64 = base64.b64encode(video_file.read()).decode("utf-8")

# A single user turn pairing the video with a text instruction, plus a
# system prompt that sets the assistant's persona and expected output.
request_payload = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"video": {"format": "mp4", "source": {"bytes": video_b64}}},
                {"text": "Provide video titles for this clip."},
            ],
        }
    ],
    "system": [
        {"text": "You are an expert media analyst. When the user provides you with a video, provide 3 potential video titles"}
    ],
    "inferenceConfig": {"max_new_tokens": 300, "top_p": 0.1, "top_k": 20, "temperature": 0.3},
}

# Send the request and decode the JSON response body.
response = client.invoke_model(modelId=LITE_MODEL_ID, body=json.dumps(request_payload))
model_response = json.loads(response["body"].read())

# Pretty print the response JSON.
print("[Full Response]")
print(json.dumps(model_response, indent=2))

# Show only the generated text for readability.
content_text = model_response["output"]["message"]["content"][0]["text"]
print("\n[Response Content Text]")
print(content_text)

image-20241215210455419