danny-avila · berry-13 · Aug 4, 2023 · Aug 4, 2023 · Aug 4, 2023 · Aug 5, 2023
diff --git a/.env.example b/.env.example
@@ -192,6 +192,14 @@ MEILI_NO_ANALYTICS=true
 MEILI_HOST=http://0.0.0.0:7700
 MEILI_MASTER_KEY=DrhYf7zENyR6AlUCKmnz0eYASOQdl6zxH7s7MKFSfFCt
 
+
+#==================================================#
+#          Speech to Text & Text to Speech         #
+#==================================================#
+
+WHISPER_API_KEY=
+ELEVENLABS_API_KEY=
+
 #===================================================#
 #                    User System                    #
 #===================================================#

diff --git a/api/server/routes/config.js b/api/server/routes/config.js
@@ -32,6 +32,7 @@ router.get('/', async function (req, res) {
         !!process.env.EMAIL_PASSWORD &&
         !!process.env.EMAIL_FROM,
       checkBalance: isEnabled(process.env.CHECK_BALANCE),
+      speechToTextExternal: !!process.env.WHISPER_API_KEY,
     };
 
     if (typeof process.env.CUSTOM_FOOTER === 'string') {

diff --git a/api/server/routes/files/index.js b/api/server/routes/files/index.js
@@ -12,12 +12,16 @@ const {
 const files = require('./files');
 const images = require('./images');
 const avatar = require('./avatar');
+const stt = require('./stt');
+const tts = require('./tts');
 
 router.use(requireJwtAuth);
 router.use(checkBan);
 router.use(uaParser);
 
 router.use('/', files);
+router.use('/stt', stt);
+router.use('/tts', tts);
 router.use('/images', images);
 router.use('/images/avatar', avatar);
 

diff --git a/api/server/routes/files/stt.js b/api/server/routes/files/stt.js
@@ -0,0 +1,31 @@
+const express = require('express');
+const router = express.Router();
+const { requireJwtAuth } = require('~/server/middleware/');
+const multer = require('multer');
+const { speechToTextLocal, speechToTextWhisper } = require('~/server/services/Files/Audio');
+
+// Created without a storage option; the services read the upload from req.file.buffer.
+const upload = multer();
+
+/**
+ * POST / — transcribes the audio uploaded in the `audio` form field.
+ *
+ * The local service is used when WHISPER_LOCAL is exactly 'true', the Whisper API service
+ * otherwise. Both services write the HTTP response themselves; the catch here only handles
+ * errors that escape them, so a rejected promise cannot leave the request unanswered.
+ */
+router.post('/', requireJwtAuth, upload.single('audio'), async (req, res) => {
+  const transcribe =
+    process.env.WHISPER_LOCAL === 'true' ? speechToTextLocal : speechToTextWhisper;
+
+  try {
+    await transcribe(req, res);
+  } catch (error) {
+    console.error('Speech-to-text request failed:', error);
+    if (!res.headersSent) {
+      res.status(500).json({ message: 'An error occurred while processing the audio' });
+    }
+  }
+});
+
+module.exports = router;
diff --git a/api/server/routes/files/tts.js b/api/server/routes/files/tts.js
@@ -0,0 +1,28 @@
+const express = require('express');
+const router = express.Router();
+const { requireJwtAuth } = require('~/server/middleware/');
+// The Audio index exports an object of services, so the function has to be destructured;
+// binding the whole module and calling it throws "textToSpeechLocal is not a function".
+const { textToSpeechLocal } = require('~/server/services/Files/Audio');
+
+/**
+ * POST / — runs the text-to-speech service for the request.
+ *
+ * The service's return value is sent only if the service has not already written the
+ * response itself, so the response is never sent twice.
+ */
+router.post('/', requireJwtAuth, async (req, res) => {
+  try {
+    const audioBuffer = await textToSpeechLocal(req, res);
+    if (!res.headersSent) {
+      res.send(audioBuffer);
+    }
+  } catch (error) {
+    console.error('Text-to-speech request failed:', error);
+    if (!res.headersSent) {
+      res.status(500).json({ message: 'An error occurred while generating the audio' });
+    }
+  }
+});
+
+module.exports = router;
diff --git a/api/server/services/Files/Audio/index.js b/api/server/services/Files/Audio/index.js
@@ -0,0 +1,6 @@
+// Barrel module: exposes each audio service under the name of the file that implements it.
+module.exports = {
+  speechToTextLocal: require('./speechToTextLocal'),
+  textToSpeechLocal: require('./textToSpeechLocal'),
+  speechToTextWhisper: require('./speechToTextWhisper'),
+};
diff --git a/api/server/services/Files/Audio/speechToTextLocal.js b/api/server/services/Files/Audio/speechToTextLocal.js
@@ -0,0 +1,51 @@
+const axios = require('axios');
+const FormData = require('form-data');
+const { Readable } = require('stream');
+
+/**
+ * Transcribes the uploaded audio via the transcription endpoint on http://localhost:8080.
+ *
+ * Exactly one response is written on every path: 400 when the request carries no audio,
+ * `{ text }` on success, and 500 with a `message` for any failure.
+ *
+ * @param req - Express request; the audio is read from `req.file.buffer`.
+ * @param res - Express response the result is written to.
+ */
+async function speechToTextLocal(req, res) {
+  if (!req.file || !req.file.buffer) {
+    console.error('No audio file provided in the FormData');
+    return res.status(400).json({ message: 'No audio file provided in the FormData' });
+  }
+
+  // Expose the in-memory upload as a stream; `path` gives it a filename for mimeType detection.
+  const audioReadStream = Readable.from(req.file.buffer);
+  audioReadStream.path = 'audio.wav';
+
+  const formData = new FormData();
+  formData.append('file', audioReadStream, { filename: 'audio.wav', contentType: 'audio/wav' });
+  formData.append('model', 'whisper');
+
+  try {
+    const response = await axios.post('http://localhost:8080/v1/audio/transcriptions', formData, {
+      headers: formData.getHeaders(),
+    });
+
+    const transcript = response && response.data ? response.data.text : undefined;
+    if (typeof transcript !== 'string') {
+      // Answer explicitly so the client is never left waiting on a reply without text.
+      console.error('Transcription response did not contain any text');
+      return res.status(500).json({ message: 'An error occurred while processing the audio' });
+    }
+
+    res.json({ text: transcript.trim() });
+  } catch (error) {
+    console.error(error);
+    // `error.response` is only set when the server answered; connection failures have none.
+    if (error && error.response) {
+      console.error('Server response:', error.response.data);
+    }
+    res.status(500).json({ message: 'An error occurred while processing the audio' });
+  }
+}
+
+module.exports = speechToTextLocal;
diff --git a/api/server/services/Files/Audio/speechToTextWhisper.js b/api/server/services/Files/Audio/speechToTextWhisper.js
@@ -0,0 +1,57 @@
+const axios = require('axios');
+const FormData = require('form-data');
+
+/**
+ * Transcribes the uploaded audio with the `whisper-1` model of the OpenAI transcription API,
+ * authenticating with WHISPER_API_KEY.
+ *
+ * Exactly one response is written on every path: 400 when the request carries no audio,
+ * `{ text }` on success, and 500 with a `message` for any failure.
+ *
+ * @param req - Express request; the audio is read from `req.file`.
+ * @param res - Express response the result is written to.
+ */
+async function speechToTextWhisper(req, res) {
+  if (!req.file || !req.file.buffer) {
+    console.error('No audio file provided in the FormData');
+    return res.status(400).json({ message: 'No audio file provided in the FormData' });
+  }
+
+  const formData = new FormData();
+  // The form-data package accepts Buffers and streams, not Blobs, and needs a filename to
+  // send the part as a file.
+  // NOTE(review): the 'audio.wav' fallback mirrors the local service — confirm the upload name.
+  formData.append('file', req.file.buffer, {
+    filename: req.file.originalname || 'audio.wav',
+    contentType: req.file.mimetype,
+  });
+  formData.append('model', 'whisper-1');
+
+  try {
+    const response = await axios.post('https://api.openai.com/v1/audio/transcriptions', formData, {
+      // getHeaders() provides the multipart Content-Type together with its boundary.
+      headers: {
+        ...formData.getHeaders(),
+        Authorization: `Bearer ${process.env.WHISPER_API_KEY}`,
+      },
+    });
+
+    const transcript = response && response.data ? response.data.text : undefined;
+    if (typeof transcript !== 'string') {
+      // Answer explicitly so the client is never left waiting on a reply without text.
+      console.error('Transcription response did not contain any text');
+      return res.status(500).json({ message: 'An error occurred while processing the audio' });
+    }
+
+    res.json({ text: transcript.trim() });
+  } catch (error) {
+    console.error(error);
+    // `error.response` is only set when the server answered; connection failures have none.
+    if (error && error.response) {
+      console.error('Server response:', error.response.data);
+    }
+    res.status(500).json({ message: 'An error occurred while processing the audio' });
+  }
+}
+
+module.exports = speechToTextWhisper;
diff --git a/api/server/services/Files/Audio/textToSpeechLocal.js b/api/server/services/Files/Audio/textToSpeechLocal.js
@@ -0,0 +1,20 @@
+/**
+ * Placeholder text-to-speech service: no audio is synthesized yet.
+ *
+ * Writes the fixed string 'Test response' as the HTTP response and returns the same string.
+ *
+ * @param req - Express request; not read by the placeholder.
+ * @param res - Express response the placeholder text is written to.
+ * @returns the string that was sent.
+ */
+async function textToSpeechLocal(req, res) {
+  const response = 'Test response';
+
+  // The request is intentionally not logged: dumping the whole object would put its
+  // headers into the server log.
+  res.send(response);
+
+  return response;
+}
+
+module.exports = textToSpeechLocal;
+module.exports = textToSpeechLocal;
diff --git a/client/package.json b/client/package.json
@@ -52,6 +52,7 @@
     "downloadjs": "^1.4.7",
     "export-from-json": "^1.7.2",
     "filenamify": "^6.0.0",
+    "hotkeys-js": "^3.12.0",
     "html-to-image": "^1.11.11",
     "image-blob-reduce": "^4.1.0",
     "librechat-data-provider": "*",

diff --git a/client/src/components/Auth/LoginForm.tsx b/client/src/components/Auth/LoginForm.tsx
@@ -107,7 +107,8 @@ function LoginForm({ onSubmit }: TLoginFormProps) {
           aria-label="Sign in"
           data-testid="login-button"
           type="submit"
-          className="w-full transform rounded-md bg-green-500 px-4 py-3 tracking-wide text-white transition-colors duration-200 hover:bg-green-600 focus:bg-green-600 focus:outline-none">
+          className="w-full transform rounded-md bg-green-500 px-4 py-3 tracking-wide text-white transition-colors duration-200 hover:bg-green-600 focus:bg-green-600 focus:outline-none"
+        >
           {localize('com_auth_continue')}
         </button>
       </div>

diff --git a/client/src/components/Chat/Input/ChatForm.tsx b/client/src/components/Chat/Input/ChatForm.tsx
@@ -1,4 +1,5 @@
 import { useRecoilState } from 'recoil';
+import { useEffect } from 'react';
 import type { ChangeEvent } from 'react';
 import { useChatContext } from '~/Providers';
 import { useRequiresKey } from '~/hooks';
@@ -8,6 +9,8 @@ import SendButton from './SendButton';
 import Images from './Files/Images';
 import Textarea from './Textarea';
 import store from '~/store';
+import { useSpeechToText, useSpeechToTextExternal } from '~/hooks';
+import { useGetStartupConfig } from 'librechat-data-provider/react-query';
 
 export default function ChatForm({ index = 0 }) {
   const [text, setText] = useRecoilState(store.textByIndex(index));
@@ -32,6 +35,29 @@ export default function ChatForm({ index = 0 }) {
   const { requiresKey } = useRequiresKey();
   const { endpoint: _endpoint, endpointType } = conversation ?? { endpoint: null };
   const endpoint = endpointType ?? _endpoint;
+  const { data: startupConfig } = useGetStartupConfig();
+  const useExternalSpeech = startupConfig?.speechToTextExternal;
+
+  const {
+    isListening: speechIsListening,
+    isLoading: speechIsLoading,
+    text: speechText,
+  } = useSpeechToText();
+
+  const {
+    isListening: externalIsListening,
+    isLoading: externalIsLoading,
+    text: externalSpeechText,
+  } = useSpeechToTextExternal();
+
+  const isListening = useExternalSpeech ? externalIsListening : speechIsListening;
+  const isLoading = useExternalSpeech ? externalIsLoading : speechIsLoading;
+  const speechTextForm = useExternalSpeech ? externalSpeechText : speechText;
+  const finalText = speechText || externalSpeechText ? speechTextForm : text;
+
+  useEffect(() => {
+    return setText(finalText);
+  }, [finalText, setText]);
 
   return (
     <form
@@ -60,7 +86,12 @@ export default function ChatForm({ index = 0 }) {
               <StopButton stop={handleStopGenerating} setShowStopButton={setShowStopButton} />
             ) : (
               endpoint && (
-                <SendButton text={text} disabled={filesLoading || isSubmitting || requiresKey} />
+                <SendButton
+                  text={text}
+                  disabled={filesLoading || isSubmitting || requiresKey}
+                  isListening={isListening}
+                  isLoading={isLoading}
+                />
               )
             )}
           </div>

diff --git a/client/src/components/Chat/Input/SendButton.tsx b/client/src/components/Chat/Input/SendButton.tsx
@@ -1,19 +1,46 @@
-import { SendIcon } from '~/components/svg';
+import { SendIcon, ListeningIcon, Spinner } from '~/components/svg';
 import { cn } from '~/utils';
 
-export default function SendButton({ text, disabled }) {
+/**
+ * Submit control for the chat input.
+ *
+ * While `isListening` or `isLoading` is truthy the submit button is replaced by a disabled
+ * indicator (listening icon or spinner); `isListening` takes precedence when both are set.
+ */
+export default function SendButton({ text, disabled, isListening, isLoading }) {
+  let statusIcon = null;
+  if (isListening) {
+    statusIcon = <ListeningIcon />;
+  } else if (isLoading) {
+    statusIcon = <Spinner className="icon-sm m-auto text-white" />;
+  }
+
+  if (statusIcon !== null) {
+    return (
+      <button
+        className="group absolute bottom-0 right-0 z-[101] flex h-[100%] w-[50px] items-center justify-center bg-transparent p-1 text-gray-500"
+        disabled={true}
+      >
+        <span className="" data-state="closed">
+          {statusIcon}
+        </span>
+      </button>
+    );
+  }
+
   return (
     <button
       disabled={!text || disabled}
       className={cn(
-        'absolute bottom-1.5 right-2 rounded-lg border border-black p-0.5 text-white transition-colors enabled:bg-black disabled:bg-black disabled:text-gray-400 disabled:opacity-10 dark:border-white dark:bg-white dark:hover:bg-gray-900 dark:disabled:bg-white dark:disabled:hover:bg-transparent md:bottom-3 md:right-3',
+        'absolute rounded-lg rounded-md border border-black p-0.5 p-1 text-white transition-colors enabled:bg-black disabled:bg-black disabled:text-gray-400 disabled:opacity-10 dark:border-white dark:bg-white enabled:dark:bg-white dark:disabled:bg-white ',
+        'bottom-1.5 right-1.5 md:bottom-2.5 md:right-3 md:p-[2px]',
       )}
       data-testid="send-button"
       type="submit"
     >
       <span className="" data-state="closed">
         <SendIcon size={24} />
       </span>
     </button>
   );
 }
diff --git a/client/src/components/Chat/Messages/HoverButtons.tsx b/client/src/components/Chat/Messages/HoverButtons.tsx
@@ -1,8 +1,18 @@
 import { useState } from 'react';
+import { useRecoilState } from 'recoil';
 import type { TConversation, TMessage } from 'librechat-data-provider';
-import { Clipboard, CheckMark, EditIcon, RegenerateIcon, ContinueIcon } from '~/components/svg';
-import { useGenerationsByLatest, useLocalize } from '~/hooks';
+import {
+  Clipboard,
+  CheckMark,
+  EditIcon,
+  RegenerateIcon,
+  ContinueIcon,
+  VolumeIcon,
+  VolumeMuteIcon,
+} from '~/components/svg';
+import { useGenerationsByLatest, useLocalize, useTextToSpeech } from '~/hooks';
 import { cn } from '~/utils';
+import store from '~/store';
 
 type THoverButtons = {
   isEditing: boolean;
@@ -31,6 +41,10 @@ export default function HoverButtons({
   const { endpoint: _endpoint, endpointType } = conversation ?? {};
   const endpoint = endpointType ?? _endpoint;
   const [isCopied, setIsCopied] = useState(false);
+  const [isSpeaking, setIsSpeaking] = useState(false);
+  const { synthesizeSpeech, cancelSpeech } = useTextToSpeech();
+  const [TextToSpeech] = useRecoilState<boolean>(store.TextToSpeech);
+
   const { hideEditButton, regenerateEnabled, continueSupported } = useGenerationsByLatest({
     isEditing,
     isSubmitting,
@@ -51,8 +65,28 @@ export default function HoverButtons({
     enterEdit();
   };
 
+  const toggleSpeech = () => {
+    if (isSpeaking) {
+      cancelSpeech();
+      setIsSpeaking(false);
+    } else {
+      setIsSpeaking(true); // set before synthesis so a synchronous end callback is not overwritten
+      synthesizeSpeech(message?.text ?? '', () => setIsSpeaking(false));
+    }
+  };
+
   return (
     <div className="visible mt-0 flex justify-center gap-1 self-end text-gray-400 lg:justify-start">
+      {TextToSpeech && (
+        <button
+          className="hover-button rounded-md p-1 pl-0 text-gray-400 hover:text-gray-950 dark:text-gray-400/70 dark:hover:text-gray-200 disabled:dark:hover:text-gray-400 md:group-hover:visible md:group-[.final-completion]:visible"
+          onClick={toggleSpeech}
+          type="button"
+          title={isSpeaking ? localize('com_ui_stop_speaking') : localize('com_ui_speak')}
+        >
+          {isSpeaking ? <VolumeMuteIcon /> : <VolumeIcon />}
+        </button>
+      )}
       <button
         className={cn(
           'hover-button rounded-md p-1 pl-0 text-gray-400 hover:text-gray-950 dark:text-gray-400/70 dark:hover:text-gray-200 disabled:dark:hover:text-gray-400 md:group-hover:visible md:group-[.final-completion]:visible',

diff --git a/client/src/components/Input/Generations/Regenerate.tsx b/client/src/components/Input/Generations/Regenerate.tsx
@@ -9,7 +9,7 @@ export default function Regenerate({ onClick }: TGenButtonProps) {
   return (
     <Button onClick={onClick}>
       <RegenerateIcon className="h-3 w-3 flex-shrink-0 text-gray-600/90 dark:text-gray-400" />
-	  {localize('com_ui_regenerate')}
+      {localize('com_ui_regenerate')}
     </Button>
   );
 }
diff --git a/client/src/components/Input/Generations/Stop.tsx b/client/src/components/Input/Generations/Stop.tsx
@@ -9,7 +9,7 @@ export default function Stop({ onClick }: TGenButtonProps) {
   return (
     <Button type="stop" onClick={onClick}>
       <StopGeneratingIcon className="text-gray-600/90 dark:text-gray-400 " />
-	  {localize('com_ui_stop')}
+      {localize('com_ui_stop')}
     </Button>
   );
 }