feat: audio recording #4439

Open · wants to merge 8 commits into main
22 changes: 18 additions & 4 deletions server/router/frontend/dist/index.html
@@ -1,11 +1,25 @@
-<!DOCTYPE html>
+<!doctype html>
 <html lang="en">
 <head>
 <meta charset="UTF-8" />
-<meta name="viewport" content="width=device-width, initial-scale=1.0" />
+<link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png" />
+<link rel="icon" type="image/webp" href="/logo.webp" />
+<link rel="manifest" href="/site.webmanifest" />
+<meta name="theme-color" media="(prefers-color-scheme: light)" content="#f4f4f5" />
+<meta name="theme-color" media="(prefers-color-scheme: dark)" content="#18181b" />
+<meta name="viewport" content="width=device-width, initial-scale=1, user-scalable=no" />
+<!-- memos.metadata.head -->
 <title>Memos</title>
+<script type="module" crossorigin src="/assets/index-BPauSa-q.js"></script>
+<link rel="modulepreload" crossorigin href="/assets/mui-vendor-Bq4rR2hV.js">
+<link rel="modulepreload" crossorigin href="/assets/utils-vendor-CofsvS4N.js">
+<link rel="modulepreload" crossorigin href="/assets/mermaid-vendor-CTsb84w1.js">
+<link rel="modulepreload" crossorigin href="/assets/katex-vendor-ChWnQ-fc.js">
+<link rel="modulepreload" crossorigin href="/assets/leaflet-vendor-DFXsBYSp.js">
+<link rel="stylesheet" crossorigin href="/assets/index-Dpyk-JdG.css">
 </head>
-<body>
-No embeddable frontend found.
+<body class="text-base w-full min-h-svh bg-zinc-50 dark:bg-zinc-900">
+<div id="root" class="relative w-full min-h-full"></div>
+<!-- memos.metadata.body -->
 </body>
 </html>
2 changes: 1 addition & 1 deletion web/package.json
@@ -79,4 +79,4 @@
"esbuild"
]
}
}
}
289 changes: 289 additions & 0 deletions web/src/components/MemoEditor/ActionButton/RecordAudioButton.tsx
@@ -0,0 +1,289 @@
import { Button } from "@usememos/mui";
import { MicIcon, StopCircleIcon } from "lucide-react";
import { useCallback, useContext, useState, useRef } from "react";
import toast from "react-hot-toast";
import { resourceStore } from "@/store/v2";
import { Resource } from "@/types/proto/api/v1/resource_service";
import { useTranslate } from "@/utils/i18n";
import { MemoEditorContext } from "../types";

// Local type declarations for the Web Speech API
interface ISpeechRecognition extends EventTarget {
continuous: boolean;
interimResults: boolean;
lang: string;
start(): void;
stop(): void;
abort(): void;
onstart: ((this: ISpeechRecognition, ev: Event) => any) | null;
onresult: ((this: ISpeechRecognition, ev: SpeechRecognitionEvent) => any) | null;
onerror: ((this: ISpeechRecognition, ev: SpeechRecognitionErrorEvent) => any) | null;
onend: ((this: ISpeechRecognition, ev: Event) => any) | null;
}

interface SpeechRecognitionEvent extends Event {
resultIndex: number;
results: SpeechRecognitionResultList;
}

interface SpeechRecognitionErrorEvent extends Event {
error: string;
}

declare global {
interface Window {
SpeechRecognition: new () => ISpeechRecognition;
webkitSpeechRecognition: new () => ISpeechRecognition;
}
}
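// Note: these local declarations are needed because the SpeechRecognition API
// is not part of TypeScript's default DOM typings, and Chromium-based browsers
// expose the constructor under the webkit prefix.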

const RecordAudioButton = () => {
const t = useTranslate();
const context = useContext(MemoEditorContext);
const [isRecording, setIsRecording] = useState(false);
const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder | null>(null);
const [isTranscribing, setIsTranscribing] = useState(false);
const speechRecognitionRef = useRef<ISpeechRecognition | null>(null);

// Refs tracking the interim/final transcript state across recognition callbacks
const interimTranscriptRef = useRef<string>("");
const finalTranscriptRef = useRef<string>("");
const insertPositionRef = useRef<number>(0);

// Check whether the browser supports speech recognition
const isSpeechRecognitionSupported = () => {
return "webkitSpeechRecognition" in window || "SpeechRecognition" in window;
};

// Initialize the speech recognition session
const initSpeechRecognition = useCallback(() => {
if (!isSpeechRecognitionSupported()) {
return null;
}

const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
const recognition = new SpeechRecognition();

recognition.continuous = true;
recognition.interimResults = true;
recognition.lang = "zh-CN"; // Defaults to Chinese; adjust as needed

recognition.onstart = () => {
setIsTranscribing(true);
console.log("Speech recognition started");

// Record the position where transcribed text will be inserted
if (context.editorRef?.current) {
const editor = context.editorRef.current;
const currentContent = editor.getContent();
insertPositionRef.current = currentContent.length;

// Reset the transcript state
interimTranscriptRef.current = "";
finalTranscriptRef.current = "";
}
};

recognition.onresult = (event: SpeechRecognitionEvent) => {
let interimTranscript = "";
let finalTranscript = "";

// Process all results since resultIndex
for (let i = event.resultIndex; i < event.results.length; i++) {
const transcript = event.results[i][0].transcript;
if (event.results[i].isFinal) {
finalTranscript += transcript;
} else {
interimTranscript += transcript;
}
}
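// Replace-on-update: the previously inserted transcript (final + interim) is
// removed and the refreshed transcript is re-inserted at the recorded start
// position, so interim results update in place. This assumes the content before
// insertPositionRef is not edited while recognition is running.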

if (context.editorRef?.current) {
const editor = context.editorRef.current;
const currentContent = editor.getContent();

// Length of the previously inserted transcript that must be removed
const oldTextLength = finalTranscriptRef.current.length + interimTranscriptRef.current.length;

// Remove the old transcript text, if any
if (oldTextLength > 0) {
const newContent = currentContent.slice(0, insertPositionRef.current) +
currentContent.slice(insertPositionRef.current + oldTextLength);
editor.setContent(newContent);
}

// Update the transcript state
if (finalTranscript) {
finalTranscriptRef.current += finalTranscript;
}
interimTranscriptRef.current = interimTranscript;

// Insert the updated transcript text
const newTranscriptText = finalTranscriptRef.current + interimTranscript;
if (newTranscriptText) {
const contentBeforeInsert = editor.getContent();
let textToInsert = newTranscriptText;

// Add a separating space at the insertion point when needed
if (insertPositionRef.current > 0 &&
contentBeforeInsert[insertPositionRef.current - 1] &&
!contentBeforeInsert[insertPositionRef.current - 1].match(/[\s\n]/)) {
textToInsert = ' ' + textToInsert;
}

// Insert the text
const newContent = contentBeforeInsert.slice(0, insertPositionRef.current) +
textToInsert +
contentBeforeInsert.slice(insertPositionRef.current);
editor.setContent(newContent);

// Move the cursor to the end of the inserted text
const cursorPosition = insertPositionRef.current + textToInsert.length;
editor.setCursorPosition(cursorPosition);
}
}
};

recognition.onerror = (event: SpeechRecognitionErrorEvent) => {
console.error("Speech recognition error:", event.error);
if (event.error === "not-allowed") {
toast.error(t("message.microphone-not-available"));
} else {
toast.error(`Speech recognition error: ${event.error}`);
}
};

recognition.onend = () => {
setIsTranscribing(false);
console.log("Speech recognition ended");

// Reset the transcript state
interimTranscriptRef.current = "";
finalTranscriptRef.current = "";
};

return recognition;
}, [t, context]);

// Detect an audio MIME type supported by this browser
const getSupportedMimeType = () => {
const types = ["audio/webm", "audio/mp4", "audio/aac", "audio/wav", "audio/ogg"];
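// Probe order matters: Chromium and Firefox generally record audio/webm,
// while Safari records audio/mp4.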

for (const type of types) {
if (MediaRecorder.isTypeSupported(type)) {
return type;
}
}
return null;
};

const startRecording = useCallback(async () => {
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

const mimeType = getSupportedMimeType();
if (!mimeType) {
throw new Error("No supported audio format found");
}

const recorder = new MediaRecorder(stream, {
mimeType: mimeType,
});

const chunks: BlobPart[] = [];

recorder.ondataavailable = (e) => chunks.push(e.data);
recorder.onstop = async () => {
const blob = new Blob(chunks, { type: mimeType });
const buffer = new Uint8Array(await blob.arrayBuffer());

// Pick a file extension that matches the MIME type
const getFileExtension = (mimeType: string) => {
switch (mimeType) {
case "audio/webm":
return "webm";
case "audio/mp4":
return "m4a";
case "audio/aac":
return "aac";
case "audio/wav":
return "wav";
case "audio/ogg":
return "ogg";
default:
return "webm";
}
};

try {
const resource = await resourceStore.createResource({
resource: Resource.fromPartial({
filename: `recording-${new Date().getTime()}.${getFileExtension(mimeType)}`,
type: mimeType,
size: buffer.length,
content: buffer,
}),
});
context.setResourceList([...context.resourceList, resource]);

// Notify that recording and transcription finished
toast.success("Recording and transcription completed");
} catch (error: any) {
console.error(error);
toast.error(error.details);
}

stream.getTracks().forEach((track) => track.stop());
};

// Request a recorded data chunk every second (timeslice of 1000 ms)
recorder.start(1000);
setMediaRecorder(recorder);
setIsRecording(true);

// Start speech recognition alongside the audio recording
if (isSpeechRecognitionSupported()) {
const recognition = initSpeechRecognition();
if (recognition) {
speechRecognitionRef.current = recognition;
recognition.start();
}
} else {
toast.error("您的浏览器不支持语音识别功能");
}

} catch (error) {
console.error(error);
toast.error(t("message.microphone-not-available"));
}
}, [context, resourceStore, t, initSpeechRecognition]);

const stopRecording = useCallback(() => {
if (mediaRecorder) {
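// stop() flushes any buffered audio via ondataavailable, then fires onstop,
// which assembles the blob and uploads it as a resource.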
mediaRecorder.stop();
setMediaRecorder(null);
setIsRecording(false);
}

// Stop speech recognition
if (speechRecognitionRef.current) {
speechRecognitionRef.current.stop();
speechRecognitionRef.current = null;
}

setIsTranscribing(false);
}, [mediaRecorder]);

return (
<Button
className={`p-0 relative ${isTranscribing ? "text-green-500" : ""}`}
size="sm"
variant="plain"
onClick={isRecording ? stopRecording : startRecording}
>
{isRecording ? <StopCircleIcon className="w-5 h-5 mx-auto text-red-500" /> : <MicIcon className="w-5 h-5 mx-auto" />}
</Button>
);
};

export default RecordAudioButton;
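
For reference, a minimal sketch of the transcript-merge step that onresult performs above: the previously inserted transcript is spliced out, and the refreshed final-plus-interim text is re-inserted at the recorded position. The function name and shape are illustrative only, not part of this PR.

// Sketch only: mirrors the splice logic in onresult; assumes the text before
// insertPos is not edited while recognition is running.
function mergeTranscript(content: string, insertPos: number, oldLen: number, finalText: string, interimText: string): { content: string; cursor: number } {
  // Remove the transcript inserted by the previous onresult callback.
  const base = content.slice(0, insertPos) + content.slice(insertPos + oldLen);
  // Re-insert the updated transcript at the original position.
  const text = finalText + interimText;
  const next = base.slice(0, insertPos) + text + base.slice(insertPos);
  return { content: next, cursor: insertPos + text.length };
}

// Example: mergeTranscript("note: hello", 6, 5, "hello ", "world")
// returns { content: "note: hello world", cursor: 17 }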
3 changes: 3 additions & 0 deletions web/src/components/MemoEditor/index.tsx
@@ -24,6 +24,7 @@ import DateTimeInput from "../DateTimeInput";
import AddMemoRelationPopover from "./ActionButton/AddMemoRelationPopover";
import LocationSelector from "./ActionButton/LocationSelector";
import MarkdownMenu from "./ActionButton/MarkdownMenu";
import RecordAudioButton from "./ActionButton/RecordAudioButton";
import TagSelector from "./ActionButton/TagSelector";
import UploadResourceButton from "./ActionButton/UploadResourceButton";
import VisibilitySelector from "./ActionButton/VisibilitySelector";
@@ -475,6 +476,7 @@ const MemoEditor = observer((props: Props) => {
}));
},
memoName,
editorRef,
}}
>
<div
@@ -499,6 +501,7 @@
<RelationListView relationList={referenceRelations} setRelationList={handleSetRelationList} />
<div className="relative w-full flex flex-row justify-between items-center py-1" onFocus={(e) => e.stopPropagation()}>
<div className="flex flex-row justify-start items-center opacity-80 dark:opacity-60 space-x-2">
<RecordAudioButton />
<TagSelector editorRef={editorRef} />
<MarkdownMenu editorRef={editorRef} />
<UploadResourceButton isUploadingResource={state.isUploadingResource} />
2 changes: 2 additions & 0 deletions web/src/components/MemoEditor/types/context.ts
@@ -1,13 +1,15 @@
import { createContext } from "react";
import { MemoRelation } from "@/types/proto/api/v1/memo_service";
import { Resource } from "@/types/proto/api/v1/resource_service";
import { EditorRefActions } from "../Editor";

interface Context {
resourceList: Resource[];
relationList: MemoRelation[];
setResourceList: (resourceList: Resource[]) => void;
setRelationList: (relationList: MemoRelation[]) => void;
memoName?: string;
editorRef?: React.RefObject<EditorRefActions>;
}

export const MemoEditorContext = createContext<Context>({
2 changes: 1 addition & 1 deletion web/src/components/MemoResource.tsx
@@ -18,7 +18,7 @@ const MemoResource: React.FC<Props> = (props: Props) => {
return (
<div className={`w-auto flex flex-row justify-start items-center text-gray-500 dark:text-gray-400 hover:opacity-80 ${className}`}>
{resource.type.startsWith("audio") ? (
-<audio src={resourceUrl} controls></audio>
+<audio src={resourceUrl} controls className="max-w-full" controlsList="nodownload" />
) : (
<>
<ResourceIcon className="w-4! h-4! mr-1" resource={resource} />
3 changes: 2 additions & 1 deletion web/src/locales/en.json
@@ -185,7 +185,8 @@
"restored-successfully": "Restored successfully",
"succeed-copy-link": "Link copied successfully.",
"update-succeed": "Update succeeded",
"user-not-found": "User not found"
"user-not-found": "User not found",
"microphone-not-available": "Cannot access microphone"
},
"reference": {
"add-references": "Add references",
3 changes: 2 additions & 1 deletion web/src/locales/zh-Hans.json
@@ -177,7 +177,8 @@
"restored-successfully": "恢复成功",
"succeed-copy-link": "复制链接到剪贴板成功。",
"update-succeed": "更新成功",
"user-not-found": "未找到该用户"
"user-not-found": "未找到该用户",
"microphone-not-available": "无法访问麦克风"
},
"reference": {
"add-references": "添加引用",
Expand Down