使用 Midscene.js 获取 APP 页面数据构建 AI 知识库

txm

520人浏览 · 2026-03-06 11:01:57

txm · 2026-03-06 11:01:57 发布

背景

在 AI 应用开发中，我们经常需要将 APP 的界面内容转换为结构化的文本数据，用于构建知识库、训练数据或者辅助 AI 理解移动端界面。传统方式需要解析 Android View Hierarchy 或者使用 uiautomatorviewer，过程繁琐且难以维护。

本文介绍如何使用 Midscene.js 实现自动化提取 APP 页面数据，通过 AI 视觉理解能力直接将页面截图转换为 Markdown 格式的结构化文档。

Midscene.js 简介

Midscene.js 是一个视觉驱动的 UI 自动化框架，支持 Web、Android、iOS 等多平台。其核心特点是纯视觉方案——不需要获取 DOM 树或 accessibility tree，仅通过截图就能让 AI 理解界面并执行操作。

主要特性：

支持 Android/iOS/Web/桌面应用
AI 驱动的元素定位和操作
自然语言交互
支持多种视觉大模型（QwenVL、Gemini、Doubao 等）

环境准备

1. 安装 Node.js

确保安装 Node.js 18+ 版本。

2. 安装 ADB

Android 调试桥，用于连接真机或模拟器：


# Windows (使用 Chocolatey)

choco install adb



# macOS

brew install --cask android-platform-tools



# 验证连接

adb devices

3. 配置模型

创建 .env 文件配置视觉模型：


# 方案1: OpenRouter (推荐)

MIDSCENE_MODEL_API_KEY=your-api-key

MIDSCENE_MODEL_NAME=qwen/qwen3-vl-235b-a22b-instruct

MIDSCENE_MODEL_BASE_URL=https://openrouter.ai/api/v1

MIDSCENE_MODEL_FAMILY=qwen3-vl



# 方案2: 阿里云 DashScope

# MIDSCENE_MODEL_API_KEY=your-api-key

# MIDSCENE_MODEL_NAME=qwen-vl-plus

# MIDSCENE_MODEL_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1

# MIDSCENE_MODEL_FAMILY=qwen-vl

4. 安装依赖


npm init -y

npm install @midscene/android dotenv

核心实现

脚本功能

我们的脚本实现以下功能：

连接 Android 设备
获取当前页面 Activity
截取页面截图
使用 AI 提取页面结构（标题、各区块内容）
识别可交互元素（按钮、标签、列表项等）
生成 Markdown 格式文档
支持自定义目录和文件名

完整代码


require('dotenv').config();

const { execFileSync, execSync } = require('child_process');
const fs = require('fs');
const path = require('path');

const { AndroidAgent, AndroidDevice, getConnectedDevices } = require('@midscene/android');



// Fail fast when the vision-model credentials are missing from the environment.
const {
  MIDSCENE_MODEL_API_KEY: apiKey,
  MIDSCENE_MODEL_NAME: modelName,
} = process.env;

const isModelConfigured = Boolean(apiKey) && Boolean(modelName);

if (!isModelConfigured) {
  console.error('请配置模型环境变量');
  process.exit(1);
}



/** Temporary location on the device used while capturing a screenshot. */
const REMOTE_SCREENSHOT_PATH = '/sdcard/screenshot.png';

/** Replaces characters that are not allowed in file names with underscores. */
function toSafeFileName(title) {
  return title.replace(/[<>:"/\\|?*]/g, '_').trim();
}

/** Builds the compact `YYMMDDhhmmss` timestamp used in screenshot file names. */
function formatTimestamp(date) {
  const twoDigits = (value) => String(value).padStart(2, '0');
  return [
    String(date.getFullYear()).slice(2),
    twoDigits(date.getMonth() + 1),
    twoDigits(date.getDate()),
    twoDigits(date.getHours()),
    twoDigits(date.getMinutes()),
    twoDigits(date.getSeconds()),
  ].join('');
}

/**
 * Runs `adb -s <serial> ...adbArgs` without going through a shell, so the
 * serial, the adb path and file paths need no quoting and cannot inject
 * extra commands. Returns the captured stdout as a string.
 */
function runAdb(adbPath, serial, adbArgs) {
  return execFileSync(adbPath, ['-s', serial, ...adbArgs], {
    encoding: 'utf-8',
    // The full dumpsys output is read and filtered in JS; allow for a large dump.
    maxBuffer: 16 * 1024 * 1024,
  });
}

/**
 * Returns the resumed activity as `package/activity`, or '' when none is found.
 * Filtering happens in JS rather than via `findstr`/`grep`, so it behaves the
 * same on Windows, macOS and Linux hosts.
 */
function readResumedActivity(adbPath, serial) {
  const dump = runAdb(adbPath, serial, ['shell', 'dumpsys', 'activity', 'activities']);
  const lines = dump.split(/\r?\n/);
  const legacyLines = lines.filter((line) => line.includes('mResumedActivity'));
  // NOTE(review): newer Android releases appear to print `topResumedActivity` /
  // `ResumedActivity` instead of `mResumedActivity` - confirm on target devices.
  const candidates = legacyLines.length > 0
    ? legacyLines
    : lines.filter((line) => line.includes('ResumedActivity'));
  const match = candidates.join('\n').match(/([\w.]+)\/([\w.]+)/);
  return match ? match[0] : '';
}

/** Captures a screenshot on the device and pulls it to `localPath`. Throws on failure. */
function captureScreenshot(adbPath, serial, localPath) {
  runAdb(adbPath, serial, ['shell', 'screencap', '-p', REMOTE_SCREENSHOT_PATH]);
  try {
    runAdb(adbPath, serial, ['pull', REMOTE_SCREENSHOT_PATH, localPath]);
  } finally {
    // Remove the temp file even when the pull failed; a cleanup failure must
    // not discard a screenshot that was already pulled successfully.
    try {
      runAdb(adbPath, serial, ['shell', 'rm', REMOTE_SCREENSHOT_PATH]);
    } catch (cleanupError) {
      console.warn(`清理设备截图临时文件失败: ${cleanupError.message}`);
    }
  }
}

/**
 * Resolves the Markdown output path. With a custom path, the first entry is
 * the sub-directory and the remaining entries are joined with `_` to form the
 * file name; a trailing `.md` is dropped so it is not appended twice.
 */
function resolveMarkdownPath(outputDir, customPath, safeTitle) {
  if (!customPath || customPath.length === 0) {
    return path.join(outputDir, `${safeTitle}.md`);
  }
  const [dirName, ...nameParts] = customPath;
  const baseName = nameParts.join('_').replace(/\.md$/i, '') || safeTitle;
  return path.join(outputDir, dirName, `${baseName}.md`);
}

/** Builds a forward-slash relative link from a directory to a file, for use in Markdown. */
function toRelativeLink(fromDir, targetFile) {
  const relative = path.relative(fromDir, targetFile).split(path.sep).join('/');
  return relative.startsWith('../') ? relative : `./${relative}`;
}

/** Formats a value as a table cell: '-' when empty, pipes escaped, newlines flattened. */
function toTableCell(value) {
  return String(value || '-')
    .replace(/\|/g, '\\|')
    .replace(/\s*\r?\n\s*/g, ' ');
}

/** Assembles the Markdown document from the extracted page data. */
function buildMarkdown({ pageTitle, currentActivity, description, sections, elements, screenshotLink }) {
  const parts = [
    `# ${pageTitle}\n\n---\n\n`,
    `> 生成时间: ${new Date().toLocaleString()}\n`,
    `> Activity: ${currentActivity}\n\n---\n\n`,
    `## 页面概述\n\n${description}\n\n---\n\n`,
    `## 页面结构\n\n`,
  ];

  for (const section of sections) {
    parts.push(`### ${section.heading}\n\n${section.content}\n\n`);
  }

  parts.push(`---\n\n## 可交互元素\n\n`);
  parts.push(`| 位置 | 类型 | 标签 |\n| --- | --- | --- |\n`);

  for (const element of elements) {
    const { location, type, label } = element ?? {};
    parts.push(`| ${toTableCell(location)} | ${toTableCell(type)} | ${toTableCell(label)} |\n`);
  }

  parts.push(`\n---\n\n## 页面截图\n\n`);
  // Only embed the image when the file was actually produced, so the document
  // never contains a dead link.
  parts.push(screenshotLink ? `![截图](${screenshotLink})\n` : `> 截图失败，未生成截图文件\n`);

  return parts.join('');
}

/**
 * Extracts the page currently shown on an Android device into a Markdown file.
 *
 * Steps: connect to the device, ask the AI agent for the page title, read the
 * resumed activity through adb, capture a screenshot, ask the agent for the
 * page sections, clickable elements and a short description, then write the
 * Markdown document.
 *
 * @param {string} [deviceId] - adb serial; defaults to the first connected device.
 * @param {string} [outputDir='./output'] - root directory for screenshots and Markdown.
 * @param {string[]|null} [customPath=null] - `[dirName, ...fileNameParts]`; when
 *   omitted the sanitized page title is used as the file name inside `outputDir`.
 * @returns {Promise<void>} resolves once the Markdown file is written and the
 *   device has been closed. Exits the process with code 1 when no device is
 *   connected. Errors from the agent calls or from writing the file propagate;
 *   adb failures (activity lookup, screenshot) are logged and tolerated.
 */
async function extractPage(deviceId, outputDir = './output', customPath = null) {
  // 1. Connect to the device.
  const devices = await getConnectedDevices();
  if (devices.length === 0) {
    console.error('未找到设备');
    process.exit(1);
  }

  const targetDevice = deviceId || devices[0].udid;
  const adbPath = process.env.ANDROID_ADB_PATH || 'adb';
  const device = new AndroidDevice(targetDevice);
  await device.connect();

  try {
    const agent = new AndroidAgent(device);

    fs.mkdirSync(outputDir, { recursive: true });

    console.log('开始提取页面数据...');

    // 2. Page title. The model's answer is untrusted: accept only a non-empty string.
    const titleResult = await agent.aiQuery(`{title: string}`);
    const rawTitle = typeof titleResult?.title === 'string'
      ? titleResult.title.replace(/\s+/g, ' ').trim()
      : '';
    const pageTitle = rawTitle || '未知页面';
    const safeTitle = toSafeFileName(pageTitle);

    // 3. Current activity (best effort: the document is still useful without it).
    let currentActivity = '';
    try {
      currentActivity = readResumedActivity(adbPath, targetDevice);
    } catch (activityError) {
      console.warn(`获取 Activity 失败: ${activityError.message}`);
    }

    // 4. Screenshot (best effort).
    const screenshotName = `${safeTitle}_${formatTimestamp(new Date())}.png`;
    const screenshotPath = path.join(outputDir, screenshotName);
    let hasScreenshot = false;
    try {
      captureScreenshot(adbPath, targetDevice, screenshotPath);
      hasScreenshot = true;
      console.log('截图完成');
    } catch (screenshotError) {
      console.error(`截图失败: ${screenshotError.message}`);
    }

    // 5. Page structure.
    const pageContent = await agent.aiQuery(`

    {title: string, sections: {heading: string, content: string}[]},

    分析页面内容，提取标题和各个区块

  `);

    // 6. Interactive elements.
    const elementResult = await agent.aiQuery(`

    {elements: [{type: string, label: string, location: string}]},

    找出页面中所有可点击的元素

  `);

    // 7. Short page description.
    const description = await agent.aiAsk(`简洁描述这个页面是做什么的，不超过50字`);

    // 8. Write the Markdown document.
    const mdFilePath = resolveMarkdownPath(outputDir, customPath, safeTitle);
    const mdDir = path.dirname(mdFilePath);
    fs.mkdirSync(mdDir, { recursive: true });

    const markdown = buildMarkdown({
      pageTitle,
      currentActivity,
      description,
      sections: Array.isArray(pageContent?.sections) ? pageContent.sections : [],
      elements: Array.isArray(elementResult?.elements) ? elementResult.elements : [],
      // The Markdown may live in a sub-directory while the screenshot stays in
      // outputDir, so the link has to be relative to the Markdown file.
      screenshotLink: hasScreenshot ? toRelativeLink(mdDir, screenshotPath) : null,
    });

    fs.writeFileSync(mdFilePath, markdown, 'utf-8');
    console.log(`已保存: ${mdFilePath}`);
  } finally {
    // Release the device even when a step above throws.
    // NOTE(review): confirm that AndroidDevice exposes close() in the installed
    // @midscene/android version.
    await device.close();
  }
}



/**
 * Parses the command-line arguments.
 *
 * Recognized flags are `--device <serial>` and `--output <dir>`; they may
 * appear before or after the positional arguments. Every argument that does
 * not start with `--` is collected, in order, as the custom path
 * (`[dirName, ...fileNameParts]`). Unknown `--flags` are ignored.
 *
 * @param {string[]} argv - arguments without the node binary and script path.
 * @returns {{deviceId: (string|undefined), outputDir: string, customPath: (string[]|null)}}
 */
function parseCliArgs(argv) {
  const parsed = { deviceId: undefined, outputDir: './output', customPath: null };
  const positional = [];

  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    const value = argv[i + 1];

    if (arg === '--device' && value) {
      parsed.deviceId = value;
      i++; // skip the consumed value
    } else if (arg === '--output' && value) {
      parsed.outputDir = value;
      i++; // skip the consumed value
    } else if (!arg.startsWith('--')) {
      // Collect only the positional tokens so flag names and flag values never
      // leak into the directory or file name.
      positional.push(arg);
    }
  }

  if (positional.length > 0) {
    parsed.customPath = positional;
  }
  return parsed;
}

// Parse command-line arguments and run the extraction.
const args = process.argv.slice(2);
const { deviceId, outputDir, customPath } = parseCliArgs(args);

extractPage(deviceId, outputDir, customPath).catch((error) => {
  console.error(error);
  // Signal the failure to the calling shell or CI job.
  process.exitCode = 1;
});

使用方法

基本用法


# 无参数 - 使用页面标题作为文件名

node extract.js

自定义路径


# 第一个参数为目录名，后续参数用下划线连接后作为文件名（脚本会自动追加 .md 后缀）

node extract.js 菜单 首页

# 输出: output/菜单/首页.md



node extract.js 详情 榜单 xx榜

# 输出: output/详情/榜单_xx榜.md

指定设备和输出目录


node extract.js --device emulator-5554 --output ./docs

输出示例

生成的 Markdown 文件格式如下：


# 榜单页



---



> 生成时间: 2026/3/6 10:15:04

> Activity: com.example.app/.ui.RankActivity



---



## 页面概述



展示各类榜单数据的页面



---



## 页面结构



### 顶部导航栏



榜单类型切换：本周榜、月榜、总榜



### 榜单列表



显示用户排名、头像、昵称、分数



---



## 可交互元素



| 位置 | 类型 | 标签 |

| --- | --- | --- |

| 顶部导航栏 | tab | 本周榜 |

| 顶部导航栏 | tab | 月榜 |

| 顶部导航栏 | tab | 总榜 |

| 列表项 | list_item | 用户A |

| 列表项 | list_item | 用户B |

| 底部导航栏 | tab | 首页 |

| 底部导航栏 | tab | 我的 |



---



## 页面截图



![截图](./榜单页_260306101500.png)