update README & code

wondervictor · wondervictor · commit ee9fcd7e8a66 · 2025-05-12T20:26:15.000+08:00
diff --git a/LongCoT/LongCoT.ipynb b/LongCoT/LongCoT.ipynb
@@ -180,7 +180,7 @@
    "source": [
     "# Please set the API key here\n",
     "os.environ['OPENAI_API_KEY'] = 'your api key'\n",
-    "seed_vl_version = \"doubao-1.5-vision-pro-250328\"\n",
+    "seed_vl_version = \"doubao-1-5-thinking-vision-pro-250428\"\n",
     "client = OpenAI(\n",
     "    base_url=\"https://ark.cn-beijing.volces.com/api/v3\",\n",
     "    api_key=os.environ.get(\"OPENAI_API_KEY\"),\n",
diff --git a/README.md b/README.md
@@ -1,34 +1,32 @@
-<div align="center">
- 👋 Hi, everyone! 
-    <br>
-    We are <b>ByteDance Seed team.</b>
+<div>
+<center>
+<img src="./assets/banner.png" width=400>
+</center>
 </div>
 
-
 <p align="center">
-  You can get to know us better through the following channels👇
-  <br>
-  <a href="https://seed.bytedance.com/">
-    <img src="https://img.shields.io/badge/Website-%231e37ff?style=for-the-badge&logo=bytedance&logoColor=white"></a>
-  <a href="https://github.com/user-attachments/assets/5793e67c-79bb-4a59-811a-fcc7ed510bd4">
-    <img src="https://img.shields.io/badge/WeChat-07C160?style=for-the-badge&logo=wechat&logoColor=white"></a>
- <a href="https://www.xiaohongshu.com/user/profile/668e7e15000000000303157d?xsec_token=ABl2-aqekpytY6A8TuxjrwnZskU-6BsMRE_ufQQaSAvjc%3D&xsec_source=pc_search">
-    <img src="https://img.shields.io/badge/Xiaohongshu-%23FF2442?style=for-the-badge&logo=xiaohongshu&logoColor=white"></a>
-  <a href="https://www.zhihu.com/org/dou-bao-da-mo-xing-tuan-dui/">
-    <img src="https://img.shields.io/badge/zhihu-%230084FF?style=for-the-badge&logo=zhihu&logoColor=white"></a>
+🌐 <a href="">Homepage (upcoming)</a>&nbsp;&nbsp; | &nbsp;&nbsp;🤗 <a href="https://huggingface.co">Hugging Face (upcoming)</a>&nbsp;&nbsp; | &nbsp;&nbsp;📄 <a href="https://arxiv.org/abs/">arXiv (upcoming)</a>
 </p>
 
-![seed logo](./assets/logo.jpg)
+## 🌟 Highlights
 
-# Seed1.5-VL Cookbook
+* Seed1.5-VL is a vision-language foundation model featuring a 532M-parameter vision encoder and a 20B active parameter Mixture-of-Experts (MoE) LLM, designed to advance general-purpose multimodal understanding and reasoning.
+
+* Seed1.5-VL delivers strong performance across numerous public benchmarks, achieving state-of-the-art results in areas including multimodal reasoning and agent-centric tasks.
+
+* This repository offers a usage cookbook and best practices to help developers use Seed1.5-VL effectively.
 
-Welcome to the **Seed1.5-VL** API Cookbook! This collection of code samples is designed to help you get started with using the Seed1.5-VL API. Our flagship Seed1.5-VL has been deployed on [Volcano Engine](https://www.volcengine.com/product/doubao). After obtaining your `API_KEY`, you can use the examples in this cookbook to rapidly understand and leverage the diverse capabilities of our Seed1.5-VL.
 
-## News
+## 📢 News
 * `2025-05-12:` We have released the [Seed1.5-VL Technical Report](./Seed1.5-VL-Technical-Report.pdf).
 * `2025-05-12`: We are extremely delighted to release the flagship Seed1.5-VL on Volcano Engine. The model id is `doubao-1-5-thinking-vision-pro-250428`. You can try it now!
 
-## Quick Start
+
+## 📖 Seed1.5-VL Cookbook
+
+Welcome to the **Seed1.5-VL** API Cookbook! This collection of code samples is designed to help you get started with using the Seed1.5-VL API. Our flagship Seed1.5-VL has been deployed on [Volcano Engine](https://www.volcengine.com/product/doubao). After obtaining your `API_KEY`, you can use the examples in this cookbook to rapidly understand and leverage the diverse capabilities of our Seed1.5-VL.
+
+### Quick Start
 
 - [x] Cookbook for online/offline [Gradio Demo](./GradioDemo)
 - [x] Cookbook for turning on/off [LongCoT](./longCoT)
diff --git a/Video/video_understanding.ipynb b/Video/video_understanding.ipynb
@@ -55,16 +55,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "id": "ed96287d-8fd1-454b-9bfd-9f0eaff6c56e",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# 定义抽帧策略枚举类\n",
     "class Strategy(Enum):\n",
-    "    # 固定间隔抽帧策略，例如每1秒抽一帧\n",
+    "    # sampling strategies\n",
+    "    # constant interval: sampling at a constant interval, fps sampling\n",
     "    CONSTANT_INTERVAL = \"constant_interval\"\n",
-    "    # 均匀间隔抽帧策略，根据设定的最大帧数均匀从视频全长度抽取\n",
+    "    # even interval: sampling at an even interval, uniform sampling\n",
     "    EVEN_INTERVAL = \"even_interval\""
    ]
   },
@@ -78,7 +78,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "id": "974013e8-6436-403f-a5a3-fa245f322939",
    "metadata": {},
    "outputs": [],
@@ -92,142 +92,87 @@
     "        use_timestamp: bool = True,\n",
     "        keyframe_naming_template: str = \"frame_{:04d}.jpg\",\n",
     ") -> list[str]:\n",
-    "    \"\"\"将视频按照指定策略抽帧\n",
-    "    参数:\n",
-    "        video_file_path (str): 视频文件路径\n",
-    "        output_dir (str): 输出目录\n",
-    "        extraction_strategy (Optional[Strategy], optional): 抽帧策略。\n",
-    "             固定间隔 比如 1s 抽一帧 或\n",
-    "             均匀间隔 根据设定的最大帧数 均匀从视频全长度均匀抽取\n",
-    "             默认固定间隔 1s 抽一帧\n",
-    "        interval_in_seconds (Optional[float], optional): 固定间隔抽帧的间隔时间. 默认 1s 抽一帧\n",
-    "        max_frames (Optional[int], optional): 最大抽帧帧数. 默认 10 帧\n",
-    "        use_timestamp (bool): 是否输出视频时间戳, 默认True\n",
-    "        keyframe_naming_template (_type_, optional): 抽帧图片命名模板\n",
-    "    返回:\n",
-    "        list[str]: 抽帧图片路径列表\n",
-    "        list[float]: 视频采样帧对应的时间戳\n",
+    "    \"\"\"Sample a video and extract keyframes using the selected strategy.\n",
+    "    Args:\n",
+    "        video_file_path (str): video path\n",
+    "        output_dir (str): output directory for sampled keyframes\n",
+    "        extraction_strategy (Optional[Strategy], optional): extraction strategy. Defaults to Strategy.EVEN_INTERVAL.\n",
+    "        interval_in_seconds (Optional[float], optional): sampling interval in seconds, used by Strategy.CONSTANT_INTERVAL\n",
+    "        max_frames (Optional[int], optional): maximum number of sampled frames. Defaults to 10.\n",
+    "        use_timestamp (bool): whether to output video timestamps. Defaults to True.\n",
+    "        keyframe_naming_template (str, optional): keyframe naming template. Defaults to \"frame_{:04d}.jpg\".\n",
+    "    Returns:\n",
+    "        list[str]: sampled keyframe paths\n",
+    "        list[float]: timestamps of sampled keyframes\n",
     "    \"\"\"\n",
-    "    # 检查输出目录是否存在，如果不存在则创建\n",
     "    if not os.path.exists(output_dir):\n",
     "        os.makedirs(output_dir)\n",
-    "    # 使用OpenCV打开视频文件\n",
     "    cap = cv2.VideoCapture(video_file_path)\n",
-    "    # 获取视频的帧率\n",
     "    fps = cap.get(cv2.CAP_PROP_FPS)\n",
-    "    # 获取视频的总帧数\n",
     "    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
     "\n",
-    "    # 根据策略选择抽帧间隔\n",
     "    if extraction_strategy == Strategy.CONSTANT_INTERVAL:\n",
-    "        # 计算固定间隔抽帧的帧间隔\n",
     "        frame_interval = int(fps * interval_in_seconds)\n",
     "    elif extraction_strategy == Strategy.EVEN_INTERVAL:\n",
-    "        # 计算均匀间隔抽帧的帧间隔\n",
     "        frame_interval = int(length / max_frames)\n",
     "    else:\n",
-    "        # 如果策略无效，抛出异常\n",
     "        raise ValueError(\"Invalid extraction strategy\")\n",
-    "    # 初始化帧计数器\n",
     "    frame_count = 0\n",
-    "    # 初始化关键帧列表\n",
     "    keyframes = []\n",
     "    timestamps = []\n",
-    "    # 循环读取视频帧\n",
     "    while True:\n",
-    "        # 读取一帧\n",
     "        ret, frame = cap.read()\n",
-    "        # 如果读取失败，跳出循环\n",
     "        if not ret:\n",
     "            break\n",
-    "        # 如果当前帧是关键帧\n",
     "        if frame_count % frame_interval == 0:\n",
-    "            # 生成关键帧的文件名\n",
     "            image_path = os.path.join(\n",
     "                output_dir, keyframe_naming_template.format(len(keyframes))\n",
     "            )\n",
-    "            # 将关键帧保存为图片\n",
     "            cv2.imwrite(\n",
     "                image_path,\n",
     "                frame,\n",
     "            )\n",
-    "            # 将关键帧路径添加到列表中\n",
     "            keyframes.append(image_path)\n",
     "            timestamps.append(round(frame_count / fps, 1))\n",
-    "        # 增加帧计数器\n",
     "        frame_count += 1\n",
-    "        # 如果关键帧数量达到最大值，跳出循环\n",
     "        if len(keyframes) >= max_frames:\n",
     "            break\n",
     "\n",
-    "    print(\"抽取帧数:\", len(keyframes))\n",
-    "    # 返回关键帧路径列表\n",
+    "    print(\"sampled frames:\", len(keyframes))\n",
     "    if use_timestamp:\n",
     "        return keyframes, timestamps\n",
     "    return keyframes, None\n",
     "\n",
     "def resize(image):\n",
-    "    \"\"\"\n",
-    "    调整图片大小以适应指定的尺寸。\n",
-    "    参数:\n",
-    "        image (numpy.ndarray): 输入的图片，格式为numpy数组。\n",
-    "    返回:\n",
-    "        numpy.ndarray: 调整大小后的图片。\n",
-    "    \"\"\"\n",
-    "    # 获取图片的原始高度和宽度\n",
     "    height, width = image.shape[:2]\n",
-    "    # 根据图片的宽高比确定目标尺寸\n",
     "    if height < width:\n",
     "        target_height, target_width = 480, 640\n",
     "    else:\n",
     "        target_height, target_width = 640, 480\n",
-    "    # 如果图片尺寸已经小于或等于目标尺寸，则直接返回原图片\n",
     "    if height <= target_height and width <= target_width:\n",
     "        return image\n",
-    "    # 计算新的高度和宽度，保持图片的宽高比\n",
     "    if height / target_height < width / target_width:\n",
     "        new_width = target_width\n",
     "        new_height = int(height * (new_width / width))\n",
     "    else:\n",
     "        new_height = target_height\n",
     "        new_width = int(width * (new_height / height))\n",
-    "    # 调整图片大小\n",
     "    return cv2.resize(image, (new_width, new_height))\n",
     "\n",
-    "# 定义方法将指定路径图片resize到合适大小并转为Base64编码\n",
     "def encode_image(image_path: str) -> str:\n",
-    "    \"\"\"\n",
-    "    将指定路径的图片进行编码\n",
-    "    参数:\n",
-    "        image_path (str): 图片文件的路径\n",
-    "    返回:\n",
-    "        str: 编码后的图片字符串\n",
-    "    \"\"\"\n",
-    "    # 读取图片\n",
     "    image = cv2.imread(image_path)\n",
-    "    # 调整图片大小\n",
     "    image_resized = resize(image)\n",
-    "    # 将图片编码为JPEG格式\n",
     "    _, encoded_image = cv2.imencode(\".jpg\", image_resized)\n",
-    "    # 将编码后的图片转换为Base64字符串\n",
     "    return base64.b64encode(encoded_image).decode(\"utf-8\")\n",
     "\n",
     "def construct_messages(image_paths: list[str], timestamps: list[float], prompt: str) -> list[dict]:\n",
     "    \"\"\"\n",
-    "    构造包含文本和图像的消息列表。\n",
-    "    参数:\n",
-    "        image_paths (list[str]): 图像文件路径列表。\n",
-    "        timestamps (list[float]): 视频的时间戳。\n",
-    "        prompt (str): 文本提示。\n",
-    "    返回:\n",
-    "        list[dict]: 包含文本和图像的消息列表。\n",
+    "    Build the user message for video understanding: optional per-frame timestamp text, the base64-encoded frames, then the prompt.\n",
     "    \"\"\"\n",
-    "    # 初始化消息内容列表\n",
     "    content = []\n",
-    "    # 遍历图像路径列表\n",
     "    for idx, image_path in enumerate(image_paths):\n",
-    "        # 为每个图像路径构造一个图像URL消息\n",
     "        if timestamps is not None:\n",
+    "            # add timestamp for each frame\n",
     "            content.append({\n",
     "                \"type\": \"text\",\n",
     "                \"text\": f'[{timestamps[idx]} second]'\n",
@@ -236,9 +181,7 @@
     "            {\n",
     "                \"type\": \"image_url\",\n",
     "                \"image_url\": {\n",
-    "                    # 使用Base64编码将图像转换为数据URL\n",
     "                    \"url\": f\"data:image/jpeg;base64,{encode_image(image_path)}\",\n",
-    "                    # 指定图像细节级别为低\n",
     "                    \"detail\":\"low\"\n",
     "                },\n",
     "            }\n",
@@ -248,7 +191,6 @@
     "            \"type\": \"text\",\n",
     "            \"text\": prompt,\n",
     "    })\n",
-    "    # 返回包含文本和图像的消息列表\n",
     "    return [\n",
     "        {\n",
     "            \"role\": \"user\",\n",
@@ -274,7 +216,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
    "id": "f48fd468-12d6-46c9-ae19-c2c981bdc6c2",
    "metadata": {},
    "outputs": [
@@ -324,11 +266,12 @@
     "# sampling video frames\n",
     "sampling_fps = 1\n",
     "max_frames = 30\n",
+    "sampling_interval = 1.0 / sampling_fps\n",
     "selected_images, timestamps = preprocess_video(\n",
     "    video_file_path=video_path,\n",
     "    output_dir=\"video_frames\",\n",
     "    extraction_strategy=Strategy.CONSTANT_INTERVAL,\n",
-    "    interval_in_seconds=sampling_fps,\n",
+    "    interval_in_seconds=sampling_interval,\n",
     "    use_timestamp=True,\n",
     "    max_frames=max_frames\n",
     ")\n",
@@ -348,7 +291,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
    "id": "89845b90-e976-45c1-84cd-7239963101ee",
    "metadata": {},
    "outputs": [
@@ -369,11 +312,12 @@
     "# sampling video frames\n",
     "sampling_fps = 1\n",
     "max_frames = 30\n",
+    "sampling_interval = 1.0 / sampling_fps\n",
     "selected_images, timestamps = preprocess_video(\n",
     "    video_file_path=video_path,\n",
     "    output_dir=\"video_frames\",\n",
     "    extraction_strategy=Strategy.CONSTANT_INTERVAL,\n",
-    "    interval_in_seconds=sampling_fps,\n",
+    "    interval_in_seconds=sampling_interval,\n",
     "    use_timestamp=True,\n",
     "    max_frames=max_frames\n",
     ")\n",
@@ -393,7 +337,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "2c4fbac9-5c82-447b-b174-bec330bd70df",
    "metadata": {},
    "outputs": [
@@ -418,11 +362,12 @@
     "# sampling video frames\n",
     "sampling_fps = 1\n",
     "max_frames = 30\n",
+    "sampling_interval = 1.0 / sampling_fps\n",
     "selected_images, timestamps = preprocess_video(\n",
     "    video_file_path=video_path,\n",
     "    output_dir=\"video_frames\",\n",
     "    extraction_strategy=Strategy.CONSTANT_INTERVAL,\n",
-    "    interval_in_seconds=sampling_fps,\n",
+    "    interval_in_seconds=sampling_interval,\n",
     "    use_timestamp=True,\n",
     "    max_frames=max_frames\n",
     ")\n",
diff --git a/assets/banner.png b/assets/banner.png