MFCC
LTS
-
fiddy Messages postés 11069 Date d'inscription Statut Contributeur Dernière intervention -
fiddy Messages postés 11069 Date d'inscription Statut Contributeur Dernière intervention -
Bonjour,
Je suis en train de réaliser un projet de reconnaissance vocale. Je veux utiliser les MFCC (extraction des vecteurs acoustiques) et un RNA ; je vous demande donc le code source complet en C du calcul des MFCC.
NB : suite à mes recherches sur Internet, j'ai trouvé le code suivant, mais il n'est pas complet : il manque des bibliothèques.
merci.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <arpa/inet.h>
#include "fft.h"
#include "dct.h"
/* One raw byte, used for the four-character RIFF chunk tags. */
typedef char BYTE;
/* 32-bit unsigned quantity, used for the size/rate fields of the WAV header. */
typedef uint32_t DWORD;

/* Only define PI when an earlier header has not already done so. */
#ifndef PI
#define PI 3.1415926535897932
#endif

/* Forward declarations of the routines defined after main(). */
FILE *Setup_Wave_Read(const char *fname, int *fft_size, int *sample_rate);
int Load_Chunk(FILE *fp, complex *complex_buffer, int fftsize);
void Setup_Mel(int fft_size, int sample_rate);
void Calculate_Mel(double *freq, double *result, int fft_size);
/*
 * Command-line entry point: runs the MFCC pipeline over one WAV file.
 *
 * Usage: ./mfcc <filename>
 *
 * Every Load_Chunk() call fills the real and the imaginary part of
 * sound_buffer with two different frames, so one complex FFT per iteration
 * yields two spectra. Each spectrum goes through Calculate_Mel() and
 * DCT_II(). The per-coefficient CSV output is currently disabled (commented
 * out); only the CSV header goes to stdout and progress marks to stderr.
 *
 * Returns 0 on success. Exits with status -1 on bad usage, when the input
 * file cannot be read, when the frame size is unusable, or when memory
 * cannot be allocated.
 */
int main(int argc, char* argv[])
{
    char *filename;
    FILE *fp;
    int sample_rate;
    int fft_size;
    double melscale[40];
    complex *fft;
    complex *sound_buffer;
    double *value;
    double *mfcc;
    double rate;
    int i, half, not_empty;
    int frame = 0;

    if (argc == 2) {
        filename = argv[1];
    }
    else {
        printf("Usage: ./mfcc <filename>\n");
        exit(-1);
    }

    fp = Setup_Wave_Read(filename, &fft_size, &sample_rate);
    if (fp == NULL) {
        /* fft_size and sample_rate are not valid when no stream is returned. */
        fprintf(stderr, "Could not read WAV file %s\n", filename);
        exit(-1);
    }
    if (fft_size < 2) {
        /* With an empty frame Load_Chunk() reads nothing and never reaches
         * end of file, so the loop below would not terminate. */
        fprintf(stderr, "Unusable frame size %d\n", fft_size);
        fclose(fp);
        exit(-1);
    }

    Setup_Mel(fft_size, sample_rate);
    printf("fft_size: %d\n", fft_size);

    /* Frame duration in seconds; only referenced by the disabled dumps below. */
    rate = (double)fft_size / sample_rate;
    (void)rate;

    sound_buffer = malloc(sizeof *sound_buffer * fft_size);
    value = malloc(sizeof *value * fft_size * 2);
    if (sound_buffer == NULL || value == NULL) {
        fprintf(stderr, "Out of memory\n");
        free(value);
        free(sound_buffer);
        fclose(fp);
        exit(-1);
    }

    printf("frame,time,mfcc#,coeff\n");
    do {
        not_empty = Load_Chunk(fp, sound_buffer, fft_size);
        fft = FFT_simple(sound_buffer, fft_size);

        /* calculate bin energy from real half (frame 0) */
        for (i = 0; i < fft_size; i++)
            value[i] = fabs(fft[i].re + fft[fft_size - i - 1].re + fft[i].im - fft[fft_size - i - 1].im)/2;
        /* calculate bin energy from complex half (frame 1) */
        for (i = 0; i < fft_size; i++)
            value[i+fft_size] = fabs(fft[i].re + fft[i].im + fft[fft_size - i - 1].im - fft[fft_size - i - 1].re)/2;
        /* gather magnitudes (unmirror bin energies) */
        for (i = 0; i < fft_size/2; i++) {
            value[i] += value[fft_size - i - 1];
            value[fft_size + i] += value[2*fft_size - i - 1];
        }

        /* The two spectra sit back to back in value[]; process one per frame. */
        for (half = 0; half < 2; half++) {
            Calculate_Mel(&value[half * fft_size], melscale, fft_size);
            mfcc = DCT_II(melscale, 40);
            /*for (i=0; i<40; i++) printf("%d,%f,%d,%f\n",frame,rate*frame/2,i,mfcc[i]);*/
            fprintf(stderr, ".");
            /* Extra mark every 512 frames. The test runs for every frame:
             * checking only even frame numbers could never match 0x1FF. */
            if ((frame & 0x1FF) == 0x1FF) fprintf(stderr, "+");
            free(mfcc);
            // for (i=0; i<fft_size/2; i++) printf("%d,%f,%f,%f\n",frame,2595 * log10(1 + ((1/rate)*i)/700),(1/rate)*i,value[i+half*fft_size]);
            frame++;
        }
        free(fft);
    } while (not_empty);

    free(value);
    free(sound_buffer);
    fclose(fp);
    printf("\n");
    return 0;
}
double *mel[40];
int melstart[40];
int mellength[40];
void Calculate_Mel(double *freq, double *result, int fft_size) {
int i,j;
memset(result,0,sizeof(double)*40);
for (i=0; i<40; i++) {
for (j=0; j<mellength[i]; j++) result[i] += mel[i][j]*freq[j+melstart[i]];
result[i] = log(result[i]);
}
}
/*
 * Build the 40 triangular mel filters consumed by Calculate_Mel().
 *
 * fft_size    - frame length; the first fft_size/2 spectrum bins are scanned.
 * sample_rate - sampling rate in Hz; sample_rate/fft_size is taken as the
 *               frequency of one bin.
 *
 * The mel value of 8000 Hz is divided into 41 equal steps (dphi). Filter i
 * rises from 0 to 1 between i*dphi and (i+1)*dphi and falls back to 0 at
 * (i+2)*dphi. For every filter the non-zero weights are copied into mel[i],
 * with melstart[i] the first bin and mellength[i] the number of weights.
 * A filter that matches no bin keeps mel[i] == NULL and mellength[i] == 0.
 *
 * Filters left over from an earlier call are released first. On allocation
 * failure the process prints a message and exits with status -1.
 */
void Setup_Mel(int fft_size, int sample_rate) {
    const int half = fft_size / 2;
    int i, j;
    double fmax;
    double dphi;
    double fsample;
    double freq;
    double *temp;

    /* Start from an empty bank so a repeated call does not leak. */
    for (i = 0; i < 40; i++) {
        free(mel[i]);
        mel[i] = NULL;
        melstart[i] = 0;
        mellength[i] = 0;
    }
    /* Nothing to scan; also avoids a zero-sized scratch buffer. */
    if (half < 1)
        return;

    /* Heap scratch buffer: the size depends on the input file. */
    temp = malloc(sizeof *temp * half);
    if (temp == NULL) {
        fprintf(stderr, "Setup_Mel: out of memory\n");
        exit(-1);
    }

    fmax = 2595*log10(8000.0f/700+1);
    dphi = fmax/41;
    freq = (double)sample_rate/fft_size;

    for (i = 0; i < 40; i++) {
        melstart[i] = half;
        memset(temp, 0, sizeof *temp * half);
        for (j = 0; j < half; j++) {
            fsample = 2595*log10(freq*j/700 + 1);
            /* rising edge */
            if ((dphi*i <= fsample) && (fsample < dphi*(i+1)))
                temp[j] = (fsample-dphi*i)/(dphi*(i+1)-dphi*i);
            /* falling edge */
            if ((dphi*(i+1) <= fsample) && (fsample < dphi*(i+2)))
                temp[j] = (fsample-dphi*(i+2))/(dphi*(i+1)-dphi*(i+2));
            if (temp[j] != 0) {
                if (melstart[i] > j)
                    melstart[i] = j;
                mellength[i]++;
            }
        }
        if (mellength[i] > 0) {
            mel[i] = malloc(sizeof(double)*mellength[i]);
            if (mel[i] == NULL) {
                fprintf(stderr, "Setup_Mel: out of memory\n");
                free(temp);
                exit(-1);
            }
            memcpy(mel[i], &temp[melstart[i]], mellength[i]*sizeof(double));
        }
        // for (j=0; j<mellength[i]; j++) printf("mel filter: %d, %d, %f\n",i,melstart[i]+j,mel[i][j]);
    }
    free(temp);
}
double *hamming=NULL;
FILE* Setup_Wave_Read(const char *fname,int *fft_size, int *sample_rate)
{
FILE *fp;
int i,frame_size;
printf ("Opening %s\n",fname);
fp = fopen(fname,"rb");
if (fp)
{
BYTE id[4]; //four bytes to hold 'RIFF'
complex* complex_buffer;
DWORD size; //32 bit value to hold file size
short format_tag, channels, block_align, bits_per_sample, something, temp; //our 16 values
DWORD format_length, avg_bytes_sec, data_size, i; //our 32 bit values
fread(id, sizeof(BYTE), 4, fp); //read in first four bytes
if (!strncmp(id, "RIFF",4))
{ //we had 'RIFF' let's continue
fread(&size, sizeof(DWORD), 1, fp); //read in 32bit size value
printf("size=%d\n", size);
fread(id, sizeof(BYTE), 4, fp); //read in 4 byte string now
printf("id=%4.4s\n", id);
if (!strncmp(id,"WAVE",4))
{ //this is probably a wave file since it contained "WAVE"
fread(id, sizeof(BYTE), 4, fp); //read in 4 bytes "fmt ";
printf("id=%4.4s\n", id);
fread(&format_length, sizeof(DWORD),1,fp);
printf("format_length=%d\n", format_length);
fread(&format_tag, sizeof(short), 1, fp); //check mmreg.h (i think?) for other
// possible format tags like ADPCM
printf("format_tag=%d\n", format_tag);
fread(&channels, sizeof(short),1,fp); //1 mono, 2 stereo
printf("channels=%d\n", channels);
fread(sample_rate, sizeof(DWORD), 1, fp); //like 44100, 22050, etc...
printf("sample_rate=%d\n", *sample_rate);
fread(&avg_bytes_sec, sizeof(short), 1, fp); //probably won't need this
printf("avg_bytes_sec=%d\n", avg_bytes_sec);
fread(&block_align, sizeof(short), 1, fp); //probably won't need this
printf("block_align=%d\n", block_align);
fread(&bits_per_sample, sizeof(short), 1, fp); //8 bit or 16 bit file?
printf("bits_per_sample=%d\n", bits_per_sample);
fread(&something, sizeof(short), 1, fp); // 2 bytes of something!
printf("something=%d\n", something);
fread(id, sizeof(BYTE), 4, fp); //read in 'data'
printf("id=%4.4s\n", id);
fread(&data_size, sizeof(DWORD), 1, fp); //how many bytes of sound data we have
printf("data_size=%d\n", data_size);
}
frame_size = *sample_rate / 20;
// calculate MSB
*fft_size = frame_size;
*fft_size |= *fft_size >> 1;
*fft_size |= *fft_size >> 2;
*fft_size |= *fft_size >> 4;
*fft_size |= *fft_size >> 8;
*fft_size |= *fft_size >> 16;
*fft_size = *fft_size - (*fft_size >> 1);
if (frame_size > *fft_size) *fft_size <<=1;
hamming = (double *) malloc(sizeof(double)* (*fft_size));
for (i=0; i<*fft_size; i++) {
hamming[i] = 0.54f - 0.46f * cos( (double)(2 * PI * i) / (double) (*fft_size - 1) );
// printf("hamming,%d,%f\n",i,hamming[i]);
}
return fp;
}
}
return NULL;
}
/*
 * Read the next two half-overlapping, windowed frames from a 16-bit stream.
 *
 * fp             - stream positioned inside the sample data.
 * complex_buffer - receives fftsize values; when NULL nothing is read.
 * fftsize        - number of samples per frame.
 *
 * The stream is moved back by fftsize bytes (half a frame of 2-byte
 * samples) and fftsize samples are stored in the .re members; it is then
 * moved back by half a frame again and fftsize samples are stored in the
 * .im members. Every sample is divided by 32767 and multiplied by
 * hamming[i]. Samples that cannot be read are taken as 0.
 *
 * Returns non-zero when both frames were read completely and the stream is
 * not at end of file, 0 otherwise (end of file or read error).
 */
int Load_Chunk(FILE *fp, complex *complex_buffer, int fftsize) {
    int i;
    int complete = 1;
    short temp;

    if (complex_buffer == NULL)
        return !feof(fp);

    // seek backwards 1/2 frame (fftsize/2*2bytes/value)
    // NOTE(review): the result is deliberately ignored. On the first call
    // the target may lie before the start of the file, in which case the
    // seek presumably fails and leaves the position unchanged - confirm.
    fseek(fp,-fftsize,SEEK_CUR);
    for (i=0; i<fftsize; i++) {
        if (fread(&temp,sizeof(short),1,fp) != 1) {
            temp = 0;        /* pad instead of reusing a stale sample */
            complete = 0;
        }
        complex_buffer[i].re = hamming[i]*(double)temp/32767;
        // complex_buffer[i].re = (double)temp/32767;
    }

    // seek backwards 1/2 frame (fftsize/2*2bytes/value)
    fseek(fp,-fftsize,SEEK_CUR);
    for (i=0; i<fftsize; i++) {
        if (fread(&temp,sizeof(short),1,fp) != 1) {
            temp = 0;
            complete = 0;
        }
        complex_buffer[i].im = hamming[i]*(double)temp/32767;
        // complex_buffer[i].im = (double)temp/32767;
    }

    /* A read error never sets the EOF flag, so also stop on short reads. */
    return complete && !feof(fp);
}
Je suis en train de réaliser un projet de reconnaissance vocale. Je veux utiliser les MFCC (extraction des vecteurs acoustiques) et un RNA ; je vous demande donc le code source complet en C du calcul des MFCC.
NB : suite à mes recherches sur Internet, j'ai trouvé le code suivant, mais il n'est pas complet : il manque des bibliothèques.
merci.
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <arpa/inet.h>
#include "fft.h"
#include "dct.h"
/* One raw byte, used for the four-character RIFF chunk tags. */
typedef char BYTE;
/* 32-bit unsigned quantity, used for the size/rate fields of the WAV header. */
typedef uint32_t DWORD;

/* Only define PI when an earlier header has not already done so. */
#ifndef PI
#define PI 3.1415926535897932
#endif

/* Forward declarations of the routines defined after main(). */
FILE *Setup_Wave_Read(const char *fname, int *fft_size, int *sample_rate);
int Load_Chunk(FILE *fp, complex *complex_buffer, int fftsize);
void Setup_Mel(int fft_size, int sample_rate);
void Calculate_Mel(double *freq, double *result, int fft_size);
/*
 * Command-line entry point: runs the MFCC pipeline over one WAV file.
 *
 * Usage: ./mfcc <filename>
 *
 * Every Load_Chunk() call fills the real and the imaginary part of
 * sound_buffer with two different frames, so one complex FFT per iteration
 * yields two spectra. Each spectrum goes through Calculate_Mel() and
 * DCT_II(). The per-coefficient CSV output is currently disabled (commented
 * out); only the CSV header goes to stdout and progress marks to stderr.
 *
 * Returns 0 on success. Exits with status -1 on bad usage, when the input
 * file cannot be read, when the frame size is unusable, or when memory
 * cannot be allocated.
 */
int main(int argc, char* argv[])
{
    char *filename;
    FILE *fp;
    int sample_rate;
    int fft_size;
    double melscale[40];
    complex *fft;
    complex *sound_buffer;
    double *value;
    double *mfcc;
    double rate;
    int i, half, not_empty;
    int frame = 0;

    if (argc == 2) {
        filename = argv[1];
    }
    else {
        printf("Usage: ./mfcc <filename>\n");
        exit(-1);
    }

    fp = Setup_Wave_Read(filename, &fft_size, &sample_rate);
    if (fp == NULL) {
        /* fft_size and sample_rate are not valid when no stream is returned. */
        fprintf(stderr, "Could not read WAV file %s\n", filename);
        exit(-1);
    }
    if (fft_size < 2) {
        /* With an empty frame Load_Chunk() reads nothing and never reaches
         * end of file, so the loop below would not terminate. */
        fprintf(stderr, "Unusable frame size %d\n", fft_size);
        fclose(fp);
        exit(-1);
    }

    Setup_Mel(fft_size, sample_rate);
    printf("fft_size: %d\n", fft_size);

    /* Frame duration in seconds; only referenced by the disabled dumps below. */
    rate = (double)fft_size / sample_rate;
    (void)rate;

    sound_buffer = malloc(sizeof *sound_buffer * fft_size);
    value = malloc(sizeof *value * fft_size * 2);
    if (sound_buffer == NULL || value == NULL) {
        fprintf(stderr, "Out of memory\n");
        free(value);
        free(sound_buffer);
        fclose(fp);
        exit(-1);
    }

    printf("frame,time,mfcc#,coeff\n");
    do {
        not_empty = Load_Chunk(fp, sound_buffer, fft_size);
        fft = FFT_simple(sound_buffer, fft_size);

        /* calculate bin energy from real half (frame 0) */
        for (i = 0; i < fft_size; i++)
            value[i] = fabs(fft[i].re + fft[fft_size - i - 1].re + fft[i].im - fft[fft_size - i - 1].im)/2;
        /* calculate bin energy from complex half (frame 1) */
        for (i = 0; i < fft_size; i++)
            value[i+fft_size] = fabs(fft[i].re + fft[i].im + fft[fft_size - i - 1].im - fft[fft_size - i - 1].re)/2;
        /* gather magnitudes (unmirror bin energies) */
        for (i = 0; i < fft_size/2; i++) {
            value[i] += value[fft_size - i - 1];
            value[fft_size + i] += value[2*fft_size - i - 1];
        }

        /* The two spectra sit back to back in value[]; process one per frame. */
        for (half = 0; half < 2; half++) {
            Calculate_Mel(&value[half * fft_size], melscale, fft_size);
            mfcc = DCT_II(melscale, 40);
            /*for (i=0; i<40; i++) printf("%d,%f,%d,%f\n",frame,rate*frame/2,i,mfcc[i]);*/
            fprintf(stderr, ".");
            /* Extra mark every 512 frames. The test runs for every frame:
             * checking only even frame numbers could never match 0x1FF. */
            if ((frame & 0x1FF) == 0x1FF) fprintf(stderr, "+");
            free(mfcc);
            // for (i=0; i<fft_size/2; i++) printf("%d,%f,%f,%f\n",frame,2595 * log10(1 + ((1/rate)*i)/700),(1/rate)*i,value[i+half*fft_size]);
            frame++;
        }
        free(fft);
    } while (not_empty);

    free(value);
    free(sound_buffer);
    fclose(fp);
    printf("\n");
    return 0;
}
double *mel[40];
int melstart[40];
int mellength[40];
void Calculate_Mel(double *freq, double *result, int fft_size) {
int i,j;
memset(result,0,sizeof(double)*40);
for (i=0; i<40; i++) {
for (j=0; j<mellength[i]; j++) result[i] += mel[i][j]*freq[j+melstart[i]];
result[i] = log(result[i]);
}
}
/*
 * Build the 40 triangular mel filters consumed by Calculate_Mel().
 *
 * fft_size    - frame length; the first fft_size/2 spectrum bins are scanned.
 * sample_rate - sampling rate in Hz; sample_rate/fft_size is taken as the
 *               frequency of one bin.
 *
 * The mel value of 8000 Hz is divided into 41 equal steps (dphi). Filter i
 * rises from 0 to 1 between i*dphi and (i+1)*dphi and falls back to 0 at
 * (i+2)*dphi. For every filter the non-zero weights are copied into mel[i],
 * with melstart[i] the first bin and mellength[i] the number of weights.
 * A filter that matches no bin keeps mel[i] == NULL and mellength[i] == 0.
 *
 * Filters left over from an earlier call are released first. On allocation
 * failure the process prints a message and exits with status -1.
 */
void Setup_Mel(int fft_size, int sample_rate) {
    const int half = fft_size / 2;
    int i, j;
    double fmax;
    double dphi;
    double fsample;
    double freq;
    double *temp;

    /* Start from an empty bank so a repeated call does not leak. */
    for (i = 0; i < 40; i++) {
        free(mel[i]);
        mel[i] = NULL;
        melstart[i] = 0;
        mellength[i] = 0;
    }
    /* Nothing to scan; also avoids a zero-sized scratch buffer. */
    if (half < 1)
        return;

    /* Heap scratch buffer: the size depends on the input file. */
    temp = malloc(sizeof *temp * half);
    if (temp == NULL) {
        fprintf(stderr, "Setup_Mel: out of memory\n");
        exit(-1);
    }

    fmax = 2595*log10(8000.0f/700+1);
    dphi = fmax/41;
    freq = (double)sample_rate/fft_size;

    for (i = 0; i < 40; i++) {
        melstart[i] = half;
        memset(temp, 0, sizeof *temp * half);
        for (j = 0; j < half; j++) {
            fsample = 2595*log10(freq*j/700 + 1);
            /* rising edge */
            if ((dphi*i <= fsample) && (fsample < dphi*(i+1)))
                temp[j] = (fsample-dphi*i)/(dphi*(i+1)-dphi*i);
            /* falling edge */
            if ((dphi*(i+1) <= fsample) && (fsample < dphi*(i+2)))
                temp[j] = (fsample-dphi*(i+2))/(dphi*(i+1)-dphi*(i+2));
            if (temp[j] != 0) {
                if (melstart[i] > j)
                    melstart[i] = j;
                mellength[i]++;
            }
        }
        if (mellength[i] > 0) {
            mel[i] = malloc(sizeof(double)*mellength[i]);
            if (mel[i] == NULL) {
                fprintf(stderr, "Setup_Mel: out of memory\n");
                free(temp);
                exit(-1);
            }
            memcpy(mel[i], &temp[melstart[i]], mellength[i]*sizeof(double));
        }
        // for (j=0; j<mellength[i]; j++) printf("mel filter: %d, %d, %f\n",i,melstart[i]+j,mel[i][j]);
    }
    free(temp);
}
double *hamming=NULL;
FILE* Setup_Wave_Read(const char *fname,int *fft_size, int *sample_rate)
{
FILE *fp;
int i,frame_size;
printf ("Opening %s\n",fname);
fp = fopen(fname,"rb");
if (fp)
{
BYTE id[4]; //four bytes to hold 'RIFF'
complex* complex_buffer;
DWORD size; //32 bit value to hold file size
short format_tag, channels, block_align, bits_per_sample, something, temp; //our 16 values
DWORD format_length, avg_bytes_sec, data_size, i; //our 32 bit values
fread(id, sizeof(BYTE), 4, fp); //read in first four bytes
if (!strncmp(id, "RIFF",4))
{ //we had 'RIFF' let's continue
fread(&size, sizeof(DWORD), 1, fp); //read in 32bit size value
printf("size=%d\n", size);
fread(id, sizeof(BYTE), 4, fp); //read in 4 byte string now
printf("id=%4.4s\n", id);
if (!strncmp(id,"WAVE",4))
{ //this is probably a wave file since it contained "WAVE"
fread(id, sizeof(BYTE), 4, fp); //read in 4 bytes "fmt ";
printf("id=%4.4s\n", id);
fread(&format_length, sizeof(DWORD),1,fp);
printf("format_length=%d\n", format_length);
fread(&format_tag, sizeof(short), 1, fp); //check mmreg.h (i think?) for other
// possible format tags like ADPCM
printf("format_tag=%d\n", format_tag);
fread(&channels, sizeof(short),1,fp); //1 mono, 2 stereo
printf("channels=%d\n", channels);
fread(sample_rate, sizeof(DWORD), 1, fp); //like 44100, 22050, etc...
printf("sample_rate=%d\n", *sample_rate);
fread(&avg_bytes_sec, sizeof(short), 1, fp); //probably won't need this
printf("avg_bytes_sec=%d\n", avg_bytes_sec);
fread(&block_align, sizeof(short), 1, fp); //probably won't need this
printf("block_align=%d\n", block_align);
fread(&bits_per_sample, sizeof(short), 1, fp); //8 bit or 16 bit file?
printf("bits_per_sample=%d\n", bits_per_sample);
fread(&something, sizeof(short), 1, fp); // 2 bytes of something!
printf("something=%d\n", something);
fread(id, sizeof(BYTE), 4, fp); //read in 'data'
printf("id=%4.4s\n", id);
fread(&data_size, sizeof(DWORD), 1, fp); //how many bytes of sound data we have
printf("data_size=%d\n", data_size);
}
frame_size = *sample_rate / 20;
// calculate MSB
*fft_size = frame_size;
*fft_size |= *fft_size >> 1;
*fft_size |= *fft_size >> 2;
*fft_size |= *fft_size >> 4;
*fft_size |= *fft_size >> 8;
*fft_size |= *fft_size >> 16;
*fft_size = *fft_size - (*fft_size >> 1);
if (frame_size > *fft_size) *fft_size <<=1;
hamming = (double *) malloc(sizeof(double)* (*fft_size));
for (i=0; i<*fft_size; i++) {
hamming[i] = 0.54f - 0.46f * cos( (double)(2 * PI * i) / (double) (*fft_size - 1) );
// printf("hamming,%d,%f\n",i,hamming[i]);
}
return fp;
}
}
return NULL;
}
/*
 * Read the next two half-overlapping, windowed frames from a 16-bit stream.
 *
 * fp             - stream positioned inside the sample data.
 * complex_buffer - receives fftsize values; when NULL nothing is read.
 * fftsize        - number of samples per frame.
 *
 * The stream is moved back by fftsize bytes (half a frame of 2-byte
 * samples) and fftsize samples are stored in the .re members; it is then
 * moved back by half a frame again and fftsize samples are stored in the
 * .im members. Every sample is divided by 32767 and multiplied by
 * hamming[i]. Samples that cannot be read are taken as 0.
 *
 * Returns non-zero when both frames were read completely and the stream is
 * not at end of file, 0 otherwise (end of file or read error).
 */
int Load_Chunk(FILE *fp, complex *complex_buffer, int fftsize) {
    int i;
    int complete = 1;
    short temp;

    if (complex_buffer == NULL)
        return !feof(fp);

    // seek backwards 1/2 frame (fftsize/2*2bytes/value)
    // NOTE(review): the result is deliberately ignored. On the first call
    // the target may lie before the start of the file, in which case the
    // seek presumably fails and leaves the position unchanged - confirm.
    fseek(fp,-fftsize,SEEK_CUR);
    for (i=0; i<fftsize; i++) {
        if (fread(&temp,sizeof(short),1,fp) != 1) {
            temp = 0;        /* pad instead of reusing a stale sample */
            complete = 0;
        }
        complex_buffer[i].re = hamming[i]*(double)temp/32767;
        // complex_buffer[i].re = (double)temp/32767;
    }

    // seek backwards 1/2 frame (fftsize/2*2bytes/value)
    fseek(fp,-fftsize,SEEK_CUR);
    for (i=0; i<fftsize; i++) {
        if (fread(&temp,sizeof(short),1,fp) != 1) {
            temp = 0;
            complete = 0;
        }
        complex_buffer[i].im = hamming[i]*(double)temp/32767;
        // complex_buffer[i].im = (double)temp/32767;
    }

    /* A read error never sets the EOF flag, so also stop on short reads. */
    return complete && !feof(fp);
}