
Commit 8b1c4d2

Логистик 21 committed: Initial commit: Add project files and README

863 files changed: +139021 −0 lines changed


README.md (+48 lines)
## MP3 to Text Converter with Dialogue Role and Sentiment Analysis

## Project description

This Python script automatically processes MP3 audio files and converts them to text format (.txt). It performs the following steps:

1. **Audio transcription:** Uses OpenAI's Whisper model to recognize speech in the audio files.
2. **Dialogue segmentation:** Applies inaSpeechSegmenter to divide the audio into speech segments (male/female voice) and non-speech segments.
3. **Role classification:** Uses RuBERT to determine each speaker's role in the dialogue (customer or salesperson).
4. **Sentiment analysis:** Applies TextBlob to analyze the sentiment of the text (polarity and subjectivity).
5. **Saving results:** The transcription, role, and sentiment analysis results are saved to a text file (.txt) for each input MP3 file, as illustrated below.

The script is designed to process recordings of dialogues, such as telephone conversations, for further analysis and text processing.
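Based on the formatting logic in `mp3_to_text.py` (shown further below), each output `.txt` file contains one block per speech segment. The dialogue line and numbers here are purely illustrative:

```text
Client (female): Здравствуйте, я хотел бы уточнить мой заказ.
Sentiment: Polarity=0.00, Subjectivity=0.00
Time: 0.0-4.2s
```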
### Installation

### Prerequisites

* **Python 3.8 or higher** (the script calls `Path.unlink(missing_ok=True)`, which requires Python 3.8+)
* **FFmpeg** must be installed and available on the system `PATH` (required to convert MP3 to WAV). Installation instructions for FFmpeg depend on your operating system; you can verify the installation as shown below.
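A quick way to confirm FFmpeg is on the `PATH`:

```bash
ffmpeg -version
```

If this prints version information, the script will be able to invoke `ffmpeg` for the MP3-to-WAV conversion.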
### Installing the Python libraries

All dependencies must be installed before running the script. It is recommended to use a virtual environment (`venv`) to isolate the project dependencies.

1. **Create a virtual environment (optional, but recommended):**

```bash
python -m venv venv
```

2. **Activate the virtual environment:**

**Windows:**

```bash
venv\Scripts\activate
```

**Linux/macOS:**

```bash
source venv/bin/activate
```

3. **Install the required libraries from the `requirements.txt` file:**

```bash
pip install -r requirements.txt
```

(The `requirements.txt` file must be in the root folder of the project. A sketch of the `requirements.txt` file is below.)
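A minimal `requirements.txt` sketch, inferred from the imports in `mp3_to_text.py` (the unpinned versions are an assumption; pin versions as needed for your environment):

```text
openai-whisper
inaSpeechSegmenter
pydub
textblob
transformers
torch
```

Alternatively, after installing the packages in a working environment, the file can be regenerated with `pip freeze > requirements.txt`.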

mp3_to_text.py (+209 lines)
import subprocess
import warnings
from pathlib import Path
from typing import Dict, Tuple

import torch
import whisper
from inaSpeechSegmenter import Segmenter
from pydub import AudioSegment
from textblob import TextBlob
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Silence noisy library warnings
warnings.filterwarnings("ignore", category=UserWarning)

# --- Configuration ---
BASE_DIR = Path(__file__).parent.resolve()  # Project root folder
INPUT_MP3_DIR = BASE_DIR / "input_mp3"      # Input MP3 folder
OUTPUT_TXT_DIR = BASE_DIR / "output_txt"    # Results folder

# Create the folders if they do not exist yet
INPUT_MP3_DIR.mkdir(exist_ok=True)
OUTPUT_TXT_DIR.mkdir(exist_ok=True)
# --- Loading models ---
def load_models() -> Tuple[whisper.Whisper, AutoTokenizer, AutoModelForSequenceClassification]:
    """Load the Whisper and RuBERT models, with error handling."""
    try:
        print("Loading Whisper model...")
        model = whisper.load_model("medium")

        print("Loading RuBERT model...")
        tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
        # NOTE: the sequence-classification head added on top of the base
        # RuBERT model here is newly (randomly) initialized, so the
        # client/seller labels it produces will be unreliable without
        # fine-tuning on labeled dialogue data.
        bert_model = AutoModelForSequenceClassification.from_pretrained(
            "DeepPavlov/rubert-base-cased",
            num_labels=2
        )
        return model, tokenizer, bert_model
    except Exception as e:
        print(f"Error loading models: {e}")
        raise

whisper_model, rubert_tokenizer, rubert_model = load_models()
# --- Helper functions ---
def convert_mp3_to_wav(mp3_path: Path, wav_path: Path) -> bool:
    """Convert MP3 to WAV (16 kHz, mono) using ffmpeg."""
    try:
        cmd = [
            'ffmpeg',
            '-i', str(mp3_path),
            '-ar', '16000',
            '-ac', '1',
            '-f', 'wav',
            str(wav_path),
            '-y'
        ]
        subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True
        )
        return True
    except subprocess.CalledProcessError as e:
        print(f"Conversion error: {e}")
        return False

def determine_role_with_rubert(text: str) -> str:
    """Classify the speaker role (client/seller) using RuBERT."""
    try:
        inputs = rubert_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        )
        with torch.no_grad():
            outputs = rubert_model(**inputs)
        predicted_label = torch.argmax(outputs.logits).item()
        return ["client", "seller"][predicted_label]
    except Exception as e:
        print(f"Classification error: {e}")
        return "unknown"

def analyze_sentiment(text: str) -> Dict[str, float]:
    """Analyze text sentiment using TextBlob.

    Note: TextBlob's default sentiment analyzer is English-oriented,
    so Russian transcripts will typically score near zero on both metrics.
    """
    try:
        analysis = TextBlob(text)
        return {
            "polarity": analysis.sentiment.polarity,
            "subjectivity": analysis.sentiment.subjectivity
        }
    except Exception:
        return {"polarity": 0.0, "subjectivity": 0.0}
# --- Main processing ---
def process_audio_file(mp3_path: Path, output_txt_path: Path) -> bool:
    """Process a single MP3 file through the full pipeline."""
    # Step 1: Convert to WAV
    temp_wav = BASE_DIR / "temp.wav"
    if not convert_mp3_to_wav(mp3_path, temp_wav):
        return False

    # Load audio for segmentation
    try:
        audio = AudioSegment.from_wav(str(temp_wav))
    except Exception as e:
        print(f"Error loading audio: {e}")
        temp_wav.unlink(missing_ok=True)
        return False

    # Step 2: Voice activity detection
    print("Performing audio segmentation...")
    try:
        segmenter = Segmenter()
        segments = segmenter(str(temp_wav))
    except Exception as e:
        print(f"Segmentation error: {e}")
        temp_wav.unlink(missing_ok=True)
        return False

    final_dialogue = []

    # Step 3: Process each segment
    for seg in segments:
        label, seg_start, seg_end = seg
        if label not in {"male", "female"}:
            continue  # Skip non-speech segments

        # Convert segment boundaries to milliseconds for pydub slicing
        seg_start_ms = int(seg_start * 1000)
        seg_end_ms = int(seg_end * 1000)

        # Extract the audio segment
        segment_audio = audio[seg_start_ms:seg_end_ms]
        segment_wav = BASE_DIR / "segment_temp.wav"
        segment_audio.export(str(segment_wav), format="wav")

        # Step 4: Transcribe with Whisper
        try:
            transcription_result = whisper_model.transcribe(
                str(segment_wav),
                language="ru",
                fp16=torch.cuda.is_available()
            )
            segment_text = transcription_result["text"].strip()
        except Exception as e:
            print(f"Transcription error: {e}")
            segment_text = ""

        # Clean up the segment file
        segment_wav.unlink(missing_ok=True)

        if not segment_text:
            continue

        # Step 5: Analyze the segment
        role = determine_role_with_rubert(segment_text)
        sentiment = analyze_sentiment(segment_text)

        entry = (
            f"{role.capitalize()} ({label}): {segment_text}\n"
            f"Sentiment: Polarity={sentiment['polarity']:.2f}, "
            f"Subjectivity={sentiment['subjectivity']:.2f}\n"
            f"Time: {seg_start:.1f}-{seg_end:.1f}s\n"
        )
        final_dialogue.append(entry)

    # Step 6: Save results
    try:
        with open(output_txt_path, "w", encoding="utf-8") as f:
            f.write("\n".join(final_dialogue))
    except Exception as e:
        print(f"Error saving results: {e}")
        temp_wav.unlink(missing_ok=True)
        return False

    # Clean up the temporary WAV file
    temp_wav.unlink(missing_ok=True)
    return True
# --- Entry point ---
def main():
    print(f"Looking for MP3 files in {INPUT_MP3_DIR}...")
    mp3_files = list(INPUT_MP3_DIR.glob("*.mp3"))

    if not mp3_files:
        print(f"No MP3 files found in {INPUT_MP3_DIR}")
        print(f"Please place your MP3 files in the '{INPUT_MP3_DIR.name}' folder")
        return

    print(f"Found {len(mp3_files)} files to process")

    for mp3_file in mp3_files:
        print(f"\nProcessing: {mp3_file.name}")
        output_txt = OUTPUT_TXT_DIR / f"{mp3_file.stem}.txt"

        success = process_audio_file(mp3_file, output_txt)
        if success:
            print(f"Success! Results saved to {output_txt}")
        else:
            print(f"Failed to process {mp3_file.name}")

    print("\nProcessing complete. Results saved in:", OUTPUT_TXT_DIR)

if __name__ == "__main__":
    main()
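To run the script, place your MP3 files in the `input_mp3` folder next to `mp3_to_text.py` (both folders are created automatically on first run) and invoke:

```bash
python mp3_to_text.py
```

Transcripts are written to `output_txt/<name>.txt`, one file per input MP3.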
