Text generation with a choice of Qwen and Llama Instruct models

#2 opened by frimelle (HF Staff)
.gitattributes CHANGED
@@ -33,5 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-voice_consent_gate.png filter=lfs diff=lfs merge=lfs -text
-assets/voice_consent_gate.png filter=lfs diff=lfs merge=lfs -text
 
app.py CHANGED
@@ -1,27 +1,26 @@
 import gradio as gr
-
+# import spaces
 from gradio_client import Client, handle_file
 
 import src.generate as generate
 import src.process as process
 
+# TODO: Abusing the 'global' notation for now so we can be flexible to multiple clients.
 global client
 
-GATE_IMAGE_PATH = "./assets/voice_consent_gate_50.png"
-
 # TODO: Ideally, instead of the Client method we're using for an external voice cloning app, we use the .load() function and pass in arguments to it directly while displaying the developer's desired UI.
 #chatterbox_space = gr.load("spaces/ResembleAI/Chatterbox")
 # ------------------- UI printing functions -------------------
 def clear_all():
-    # target, user_transcript, score_html, result_html, diff_html, tts_ui
-    return "", "", "", "", "", gr.Row.update(visible=False)
+    # target, user_transcript, score_html, diff_html, result_html,
+    # TODO(?): Add tts_text, tts_audio, clone_status (Maybe? Was there before.)
+    return "", "", "", "", "", "", "", None,
 
 
 def make_result_html(pass_threshold, passed, ratio):
     """Returns HTML summarizing results.
     Parameters:
-        pass_threshold: Minimum percentage of match between target and
-            recognized user utterance that counts as passing.
+        pass_threshold: Minimum percentage of match between target and recognized user utterance that counts as passing.
         passed: Whether the recognized user utterance is >= `pass_threshold`.
         ratio: Sequence match ratio.
     """
@@ -80,16 +79,16 @@ def make_html(sentence_match):
     return score_html, result_html, diff_html
 
 
-# ------------------- Core Check (Currently English-only) -------------------
+# ------------------- Core Check (English-only) -------------------
 # @spaces.GPU
 def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
-                        asr_model_id: str, device_pref: str) -> (str, str):
+                        model_id: str, device_pref: str) -> (str, str):
     """ASR for the input audio and basic validation.
-    Uses the selected ASR model `asr_model_id` to recognize words in the input `audio_path`.
+    Uses the selected ASR model `model_id` to recognize words in the input `audio_path`.
     Parameters:
         audio_path: Processed audio file returned from gradio Audio component.
         target_sentence: Sentence the user needs to say.
-        asr_model_id: Desired ASR model.
+        model_id: Desired ASR model.
         device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
     Returns:
         error_msg: If there's an error, a string describing what happened.
@@ -103,7 +102,7 @@ def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
         return "Please start, record, then stop the audio recording before trying to transcribe.", ""
 
     # Runs the automatic speech recognition
-    user_transcript = process.run_asr(audio_path, asr_model_id, device_pref)
+    user_transcript = process.run_asr(audio_path, model_id, device_pref)
 
     # Handles processing errors.
     if isinstance(user_transcript, Exception):
@@ -111,13 +110,13 @@ def get_user_transcript(audio_path: gr.Audio, target_sentence: str,
         return "", user_transcript
 
 
-def transcribe_check(audio_path, target_sentence, asr_model_id, device_pref,
+def transcribe_check(audio_path, target_sentence, model_id, device_pref,
                      pass_threshold):
     """Transcribe user, calculate match to target sentence, create results HTML.
     Parameters:
         audio_path: Local path to recorded audio.
         target_sentence: Sentence the user needs to say.
-        asr_model_id: Desired ASR model.
+        model_id: Desired ASR model.
         device_pref: Preferred ASR processing device. Can be "auto", "cpu", "cuda".
     Returns:
         user_transcript: The recognized user utterance
@@ -129,8 +128,7 @@ def transcribe_check(audio_path, target_sentence, asr_model_id, device_pref,
     clone_audio = False
     # Transcribe user input
    error_msg, user_transcript = get_user_transcript(audio_path,
-                                                     target_sentence,
-                                                     asr_model_id,
+                                                     target_sentence, model_id,
                                                      device_pref)
     if error_msg:
         score_html = ""
@@ -146,92 +144,63 @@ def transcribe_check(audio_path, target_sentence, asr_model_id, device_pref,
     # Create the output to print out
     score_html, result_html, diff_html = make_html(sentence_match)
 
-    return (user_transcript, score_html, result_html, diff_html,
-            gr.Row(visible=clone_audio))
+    return user_transcript, score_html, result_html, diff_html, gr.Row(visible=clone_audio)
 
-def clone_voice(audio_input, text_input, exaggeration_input, cfgw_input,
-                seed_num_input, temperature_input):
+def clone_voice(audio_input, text_input):
+    # TODO: Note that this is the 'global' hack to pass in the client.
     global client
     # Additional specifications for Chatterbox include:
     # exaggeration_input=0.5,
     # temperature_input=0.8,
-    # seed_num_input=0,z
+    # seed_num_input=0,
     # cfgw_input=0.5,
     # api_name="/generate_tts_audio"
     return client.predict(text_input=text_input,
-                          audio_prompt_path_input=handle_file(audio_input),
-                          exaggeration_input=exaggeration_input,
-                          cfgw_input=cfgw_input,
-                          seed_num_input=seed_num_input,
-                          temperature_input=temperature_input)
+                          audio_prompt_path_input=handle_file(audio_input))
 
 
 # ------------------- UI -------------------
-with gr.Blocks(title="Voice Consent Gate") as demo:
-    gr.Markdown("# Voice Consent Gate: Demo")
-    with gr.Row():
-        with gr.Column():
-            gr.Image(GATE_IMAGE_PATH, interactive=False, show_download_button=False)
-        with gr.Column():
-            with gr.Accordion(
-                    label="Click for further information on this demo",
-                    open=False):
-                gr.Markdown("""
-                To create a basic voice cloning system with a voice consent gate, you need three parts:
-                1. A way of generating novel consent sentences for the person whose voice will be cloned – the “speaker” – to say, uniquely referencing the current consent context.
-                2. An _automatic speech recognition (ASR) system_ that recognizes the sentence conveying consent.
-                3. A _voice-cloning text-to-speech (TTS) system_ that takes as input text and the speaker's speech snippets to generate speech.
-
-                Since some voice-cloning TTS systems can now generate speech similar to a speaker’s voice using _just one sentence_, a sentence used for consent can **also** be used for voice cloning.
-                """)
-    with gr.Row():
-        with gr.Column(scale=2):
-            gr.Markdown(
-                """# 🎤 Say the Sentence (English)"""
-            )
-            gr.Markdown(
-                """
-                ## 1) Generate a sentence.
-                ## 2) Record yourself reading it.
-                ## 3) Transcribe & check your accuracy.
-                ## 4) If matched, clone your voice to speak any sentence you enter.
-                """
-            )
-        with gr.Column():
-            consent_method = gr.Dropdown(
-                label="Sentence generation method (currently limited to Llama 3.2 3B Instruct)",
-                choices=["Llama 3.2 3B Instruct"],
-                value="Llama 3.2 3B Instruct"
-            )
-            asr_model = gr.Dropdown(label="Speech recognition model (currently limited to Whisper)",
-                                    choices=["openai/whisper-tiny.en",  # fastest (CPU-friendly)
-                                             "openai/whisper-base.en",  # better accuracy, a bit slower
-                                             "distil-whisper/distil-small.en"
-                                             # optional distil English model
-                                             ],
-                                    value="openai/whisper-tiny.en",
-                                    )
-            voice_clone_model = gr.Dropdown(
-                label="Voice cloning model (currently limited to Chatterbox)",
-                choices=["Chatterbox", ], value="Chatterbox")
+with gr.Blocks(title="Say the Sentence (English)") as demo:
+    gr.Markdown(
+        """
+        # 🎤 Say the Sentence (English)
+        1) Generate a sentence.
+        2) Record yourself reading it.
+        3) Transcribe & check your accuracy.
+        4) If matched, clone your voice to speak any sentence you enter.
+        """
+    )
+
     with gr.Row():
         target = gr.Textbox(label="Target sentence", interactive=False,
                             placeholder="Click 'Generate sentence'")
+
     with gr.Row():
         btn_gen = gr.Button("🎲 Generate sentence", variant="primary")
         btn_clear = gr.Button("🧹 Clear")
+
     with gr.Row():
-        consent_audio = gr.Audio(sources=["microphone"], type="filepath",
-                                 label="Record your voice", key='consent_audio')
-    with gr.Accordion("Advanced ASR settings", open=False):
+        consent_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record your voice", key='consent_audio')
+
+    with gr.Accordion("Advanced settings", open=False):
+        model_id = gr.Dropdown(
+            choices=[
+                "openai/whisper-tiny.en",  # fastest (CPU-friendly)
+                "openai/whisper-base.en",  # better accuracy, a bit slower
+                "distil-whisper/distil-small.en",  # optional distil English model
+            ],
+            value="openai/whisper-tiny.en",
+            label="ASR model (English only)",
+        )
         device_pref = gr.Radio(
             choices=["auto", "cpu", "cuda"],
             value="auto",
            label="Device preference"
        )
-        # In your own code, do not provide users with the option to change this: Set it yourself.
         pass_threshold = gr.Slider(0.50, 1.00, value=0.85, step=0.01,
                                    label="Match threshold")
+
     with gr.Row():
         btn_check = gr.Button("✅ Transcribe & Check", variant="primary")
     with gr.Row():
@@ -242,66 +211,47 @@ with gr.Blocks(title="Voice Consent Gate") as demo:
         diff_html = gr.HTML(
             label="Word-level diff (red = expected but missing / green = extra or replacement)")
 
-    gr.Markdown("## 🔁 Voice Consent Gate (opens upon consent)")
     # TODO: Ideally this is gr.Blocks, but that seems to have a visibility-change bug.
     with gr.Row(visible=False) as tts_ui:
-        # Using the render decorator so that we can access consent audio after it's recorded.
+        # Using the render decorator so that we can easily pass in the consent audio after it's recorded.
        @gr.render(inputs=consent_audio)
        def show_tts(audio_input):
+            # TODO: Abusing global, since we can't send a Client as a component to a function.
            global client
            if audio_input:
                client = Client("ResembleAI/Chatterbox")
+            with gr.Row():
+                gr.Markdown("# 🔁 Voice cloning")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Audio input")
                    # Prepopulating with the consent audio.
-                    # Setting interactive=False keeps it from being possible to upload something else.
-                    tts_audio = gr.Audio(audio_input, type="filepath", interactive=False)
+                    tts_audio = gr.Audio(audio_input, interactive=True, type="filepath")
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Text input")
                    tts_text = gr.Textbox(
                        "Now let's make my mum's favourite. So three mars bars into the pan. Then we add the tuna and just stir for a bit, just let the chocolate and fish infuse. A sprinkle of olive oil and some tomato ketchup. Now smell that. Oh boy this is going to be incredible.", interactive=True)
-            with gr.Row():
-                # TODO: Ideally, these options aren't hardcoded -- e.g., using .load(), where they're imported, allowing for different options depending on the client.
-                with gr.Accordion("More options", open=False):
-                    exaggeration = gr.Slider(
-                        0.25, 2, step=.05,
-                        label="Exaggeration (Neutral = 0.5, extreme values can be unstable)",
-                        value=.5
-                    )
-                    cfg_weight = gr.Slider(
-                        0.2, 1, step=.05, label="CFG/Pace", value=0.5
-                    )
-                    seed_num = gr.Number(value=0,
-                                         label="Random seed (0 for random)")
-                    temp = gr.Slider(0.05, 5, step=.05,
-                                     label="Temperature", value=.8)
            with gr.Row():
                clone_btn = gr.Button("Clone!")
-                cloned_audio = gr.Audio(show_download_button=True)
-                clone_btn.click(fn=clone_voice,
-                                inputs=[tts_audio, tts_text, exaggeration,
-                                        cfg_weight, seed_num, temp],
-                                outputs=[cloned_audio])
+                cloned_audio = gr.Audio()
+                clone_btn.click(fn=clone_voice, inputs=[tts_audio, tts_text], outputs=[cloned_audio])
 
     # -------- Events --------
-    # Generate sentence: including model name + detailed prompt
-    btn_gen.click(
-        fn=generate.gen_sentence,
-        inputs=[consent_method, voice_clone_model],
-        outputs=target
-    )
+    # Use pre-specified sentence bank by default
+    btn_gen.click(fn=generate.gen_sentence_set, outputs=target)
+    # Or use LLM generation:
+    # btn_gen.click(fn=generate.gen_sentence_llm, outputs=target)
 
+    # TODO(?): clearing tts_text, tts_audio, clone_status (not sure what that was)
    btn_clear.click(
        fn=clear_all,
-        outputs=[target, user_transcript, score_html, result_html, diff_html,
-                 tts_ui]
+        outputs=[target, user_transcript, score_html, result_html, diff_html]
    )
 
    btn_check.click(
        fn=transcribe_check,
-        inputs=[consent_audio, target, asr_model, device_pref, pass_threshold],
+        inputs=[consent_audio, target, model_id, device_pref, pass_threshold],
        outputs=[user_transcript, score_html, result_html, diff_html, tts_ui]
    )
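For reviewers who want to exercise the simplified two-argument `clone_voice` path outside the UI, here is a minimal sketch of the same Chatterbox Space call via `gradio_client`. The audio path and text below are placeholders, and the options the old signature exposed (exaggeration, CFG/pace, seed, temperature) are assumed to fall back to the Space's defaults; `api_name="/generate_tts_audio"` comes from the comment in app.py.

```python
# Sketch only: mirrors the revised clone_voice call from the command line.
from gradio_client import Client, handle_file

client = Client("ResembleAI/Chatterbox")
result = client.predict(
    text_input="I give my consent to use my voice for generating audio with Chatterbox.",
    audio_prompt_path_input=handle_file("consent_recording.wav"),  # placeholder path
    api_name="/generate_tts_audio",
)
print(result)  # local path to the generated audio file
```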
 
assets/voice_consent_gate.png DELETED

Git LFS Details

  • SHA256: 1692551c8bace0152f60ef5039731e990e12b1429fdf004aefe328ef976d55b4
  • Pointer size: 131 Bytes
  • Size of remote file: 209 kB
assets/voice_consent_gate_50.png DELETED
Binary file (90 kB)
 
src/generate.py CHANGED
@@ -1,146 +1,45 @@
-# src/generate.py
-"""
-Module: generate
-----------------
-Handles the generation of "consent sentences" for the Voice Consent Gate demo.
+import random
 
-This module connects to an external language model (in this case, the public
-Hugging Face Space for Llama 3.2 3B Instruct) to generate natural-sounding
-sentences that users can read aloud to give informed consent for voice cloning.
-
-Functions:
-- _extract_llama_text(): Normalize the API output from the Llama demo.
-- gen_sentence(): Wrapper for gen_sentence_llm(); previously supported other options.
-- gen_sentence_llm(): Generate a consent sentence from the Llama model Space.
-"""
-
-import os
-from typing import Any
-from gradio_client import Client
+from transformers import pipeline, AutoTokenizer
 
 import src.process as process
-from src.prompts import get_consent_generation_prompt
-
-
-# ------------------- Model / Space Configuration -------------------
-# The demo connects to the Llama 3.2 3B Instruct Space on Hugging Face.
-# You can override these defaults by setting environment variables in your Space.
-LLAMA_SPACE_ID = os.getenv(
-    "LLAMA_SPACE_ID", "huggingface-projects/llama-3.2-3B-Instruct"
-)
-LLAMA_API_NAME = "/chat"  # The Space exposes a single /chat endpoint.
-HF_TOKEN = os.getenv("HF_TOKEN")  # Optional; not required for public Spaces.
-
-
-def _extract_llama_text(result: Any) -> str:
-    """
-    Normalize the API response from the Llama 3.2 3B demo Space into plain text.
-
-    The Space’s `/chat` endpoint may return different shapes depending on how
-    the Gradio app is structured — sometimes a string, other times a dictionary
-    or list. This function recursively traverses and extracts the first
-    meaningful text string it finds.
-
-    Parameters
-        result : The raw output returned by `client.predict()`.
-
-        str : Cleaned text output (may be empty string if extraction fails).
-    """
-    if isinstance(result, str):
-        return result.strip()
-    if isinstance(result, (int, float, bool)):
-        return str(result)
-    if isinstance(result, list):
-        # If multiple segments are returned (e.g., multiple sentences),
-        # join them into one string.
-        parts = []
-        for x in result:
-            s = _extract_llama_text(x)
-            if s:
-                parts.append(s)
-        return " ".join(parts).strip()
-    if isinstance(result, dict):
-        # Common key names used in Gradio JSON responses
-        for key in ("text", "response", "content", "generated_text", "message"):
-            v = result.get(key)
-            if isinstance(v, str) and v.strip():
-                return v.strip()
-    return ""
-
 
-def gen_sentence(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox"):
+# You can choose to use either:
+# (1) a list of pre-specified sentences, in SENTENCE_BANK
+# (2) an LLM-generated sentence.
+# SENTENCE_BANK is used in the `gen_sentence_set` function.
+# LLM generation is used in the `gen_sentence_llm` function.
+
+# ------------------- Sentence Bank (customize freely) -------------------
+SENTENCE_BANK = [
+    "The quick brown fox jumps over the lazy dog.",
+    "I promise to speak clearly and at a steady pace.",
+    "Open source makes AI more transparent and inclusive.",
+    "Hugging Face Spaces make demos easy to share.",
+    "Today the weather in Berlin is pleasantly cool.",
+    "Privacy and transparency should go hand in hand.",
+    "Please generate a new sentence for me to read.",
+    "Machine learning can amplify or reduce inequality.",
+    "Responsible AI requires participation from everyone.",
+    "This microphone test checks my pronunciation accuracy.",
+]
+
+
+def gen_sentence_llm():
+    """Generates a sentence using an LLM.
+    Returns:
+        Normalized text string to display in the UI.
     """
-    Always generates a sentence via the LLM.
-    Parameters
-        consent_method: str
-            The language model used to generate a consent sentence
-        voice_clone_model: str
-            The voice cloning model
-    """
-    try:
-        return gen_sentence_llm(consent_method, voice_clone_model)
-    except Exception as e:
-        # Show a helpful message directly in the Target sentence box
-        return f"[ERROR calling LLM] {type(e).__name__}: {e}"
-
-# TODO: Support more than just Llama 3.2 3B Instruct
-def gen_sentence_llm(consent_method="Llama 3.2 3B Instruct", voice_clone_model="Chatterbox") -> str:
-    """
-    Generate a consent sentence using the Llama 3.2 3B Instruct demo Space.
-
-    This function constructs a prompt describing the linguistic and ethical
-    requirements for a consent sentence (via `get_consent_generation_prompt`)
-    and sends it to the Llama demo hosted on Hugging Face Spaces.
-
-    The response is normalized into a single English sentence suitable
-    for reading aloud.
-    Parameters
-        consent_method : str
-            The name of the language model used to generate the consent utterance.
-            Currently just implemented for Llama 3.2 3B Instruct.
-        audio_model_name : str
-            The name of the voice-cloning model to mention in the sentence.
-            Defaults to "Chatterbox".
-
-    Returns
-        str
-            A clean, human-readable consent sentence.
-    """
-    # Generate the full natural-language prompt that the LLM will receive
-    prompt = get_consent_generation_prompt(voice_clone_model)
-    space_id = LLAMA_SPACE_ID
-    api_name = LLAMA_API_NAME
-
-    try:
-        # Currently always true.
-        if consent_method != "Llama 3.2 3B Instruct":
-            print("Not currently implemented for %s; using Llama 3.2 3B Instruct" % consent_method)
-        # Initialize Gradio client for the language model Space
-        client = Client(space_id, hf_token=HF_TOKEN)
-
-        # The Llama demo exposes a simple /chat endpoint with standard decoding params
-        result = client.predict(
-            message=prompt,
-            max_new_tokens=128,
-            temperature=0.6,
-            top_p=0.9,
-            top_k=50,
-            repetition_penalty=1.2,
-            api_name=api_name,
-        )
-
-        # Normalize and clean up model output
-        text = _extract_llama_text(result)
-        text = process.normalize_text(text, lower=False)
-
-        # Handle empty or malformed outputs
-        if not text:
-            raise ValueError("Empty response from Llama Space")
-
-        # In case the model produces multiple lines or options, pick the first full sentence
-        first_line = next((ln.strip() for ln in text.splitlines() if ln.strip()), "")
-        return first_line or text
-
-    except Exception as e:
-        print(f"[gen_sentence_llm] Llama Space call failed: {type(e).__name__}: {e}")
-        raise
+    prompt = ""
+    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+    generator = pipeline('text-generation', model='gpt2')
+    result = generator(prompt, stop_strings=[".", ], num_return_sequences=1,
+                       tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id)
+    display_text = process.normalize_text(result[0]["generated_text"],
+                                          lower=False)
+    return display_text
+
+
+def gen_sentence_set():
+    """Returns a sentence for the user to say using a prespecified set of options."""
+    return random.choice(SENTENCE_BANK)
src/prompts.py CHANGED
@@ -1,59 +1,47 @@
-import random
+# src/utils/prompts.py
 
-def get_consent_generation_prompt(audio_model_name: str) -> str:
+def get_consent_generation_prompt(audio_model_name: str, short_prompt: bool = False) -> str:
     """
     Returns a text prompt instructing the model to generate a natural-sounding
     consent sentence for voice cloning with the specified model.
 
     Args:
         audio_model_name (str): Name of the audio model to mention in the prompt.
+        short_prompt (bool): If True, returns a concise one-line prompt suitable
+            for direct model input. If False (default), returns the full detailed prompt.
 
     Returns:
-        str: The prompt text, with a randomized topic for the second sentence.
+        str: The prompt text.
     """
 
-    # Possible neutral or everyday topics to diversify phonetic variety
-    topics = [
-        "the weather",
-        "daily routines",
-        "travel or commuting",
-        "food or cooking",
-        "music",
-        "nature or seasons",
-        "time of day",
-        "a calm place like a park or café",
-        "light exercise or relaxation",
-        "reading or learning something new",
-        "a pleasant conversation with a friend",
-        "observing surroundings like streets or sky",
-        "working or focusing quietly"
-    ]
-
-    # Randomly choose one for this prompt instance
-    topic = random.choice(topics)
+    if short_prompt:
+        return (
+            f"Generate one natural, spoken-style English sentence (10–20 words) in which a person "
+            f"clearly gives informed consent to use their voice for generating synthetic audio "
+            f"with the model {audio_model_name}. The sentence should sound conversational, include "
+            f"a clear consent phrase like 'I give my consent' or 'I agree', mention {audio_model_name} "
+            f"by name, and be phonetically varied but neutral in tone. Output only the final sentence."
+        )
 
     return f"""
-Generate exactly two short, natural-sounding English sentences (10-15 words each) that a person could say aloud, using everyday language.
-
-Sentence 1 (Consent sentence):
-* Clearly states informed consent to use their voice for generating synthetic audio with an AI model called {audio_model_name}.
-* Must explicitly include a consent phrase such as “I give my consent,” “I agree,” or “I allow.”
-* Must clearly mention the model name {audio_model_name} in the sentence.
-* Should sound fluent, polite, and natural to read aloud.
-* Should have a neutral or positive tone and be self-contained.
-
-Sentence 2 (Phonetic variety sentence):
-* Should not repeat the consent content.
-* Adds phonetic variety with a neutral descriptive clause, for example about {topic}.
-* Should be fluent, natural, and comfortable to read aloud.
-* Should sound polite and neutral, without emotional extremes.
-* Should include diverse vowels and consonants naturally for clear pronunciation.
-
-FORMAT:
-* Output EXACTLY two sentences.
-* No numbering, no quotes, no bullet points, and no introductory text.
-* Use standard punctuation.
-
-Example format (don’t copy text, just the format):
-I give my consent to use my voice for generating audio with the model {audio_model_name}. The weather is clear and calm this afternoon, and I’m speaking at an even pace.
-"""
+Generate a short, natural-sounding English sentence (10–20 words) that a person could say aloud
+to clearly state their informed consent to use their voice for generating synthetic audio with
+an AI model called {audio_model_name}.
+
+The sentence should:
+- Sound natural and conversational, not like legal text.
+- Explicitly include a consent phrase, such as “I give my consent,” “I agree,” or “I allow.”
+- Mention the model name ({audio_model_name}) clearly in the sentence.
+- Include a neutral descriptive clause before or after the consent phrase to add phonetic variety
+  (e.g., “The weather today is bright and calm” or “This recording is made clearly and freely.”)
+- Have a neutral or polite tone (no emotional extremes).
+- Be comfortable to read aloud and phonetically rich, covering diverse vowels and consonants naturally.
+- Be self-contained, so the full sentence can serve as an independent audio clip.
+
+Examples of structure to follow:
+- “The weather is clear and warm today. I give my consent to use my voice for generating audio with the model {audio_model_name}.”
+- “I give my consent to use my voice for generating audio with the model {audio_model_name}. This statement is made freely and clearly.”
+- “Good afternoon. I agree to the use of my recorded voice for audio generation with the model {audio_model_name}.”
+
+The output should be a single, natural sentence ready to be spoken aloud for recording purposes.
+"""