/*---------------------------------------------------------------------------*\

  FILE........: phase.c
  AUTHOR......: David Rowe
  DATE CREATED: 1/2/09

  Functions for modelling and synthesising phase.

\*---------------------------------------------------------------------------*/

/*
  Copyright (C) 2009 David Rowe

  All rights reserved.

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License version 2.1, as
  published by the Free Software Foundation.  This program is
  distributed in the hope that it will be useful, but WITHOUT ANY
  WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
  License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with this program; if not, see <http://www.gnu.org/licenses/>.
*/

#include "defines.h"
#include "phase.h"
#include "kiss_fft.h"
#include "comp.h"
#include "comp_prim.h"
#include "sine.h"

#include <assert.h>
#include <ctype.h>
#include <math.h>
#include <string.h>
#include <stdlib.h>

/*---------------------------------------------------------------------------*\

  sample_phase()

  Samples the phase at the centre of each harmonic from an array of
  FFT_ENC DFT samples.

\*---------------------------------------------------------------------------*/

void sample_phase(MODEL *model,
                  COMP H[],
                  COMP A[]    /* LPC analysis filter in freq domain */
                  )
{
    int   m, b;
    float r;

    r = TWO_PI/(FFT_ENC);    /* radians per DFT bin */

    /* Sample phase at harmonics */

    for(m=1; m<=model->L; m++) {
        b = (int)(m*model->Wo/r + 0.5);
        H[m] = cconj(A[b]);  /* synth filter 1/A is opposite phase to analysis filter */
    }
}
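/*
   A minimal sketch (not called by the codec) of what the A[] argument to
   sample_phase() represents: the LPC analysis filter A(z) = sum_k a[k] z^-k
   evaluated around the unit circle at the FFT_ENC bin centres.  The helper
   name and the direct DFT evaluation below are illustrative assumptions;
   elsewhere the codec builds this spectrum more efficiently with an FFT of
   the LPC coefficients.
*/

static void lpc_to_analysis_spectrum_sketch(const float a[], int order, COMP A[])
{
    int   b, k;
    float w;

    for (b = 0; b < FFT_ENC; b++) {
        w = TWO_PI*b/FFT_ENC;                /* angular frequency of DFT bin b */
        A[b].real = 0.0f;
        A[b].imag = 0.0f;
        for (k = 0; k <= order; k++) {
            A[b].real += a[k]*cosf(k*w);     /* real part of a[k]*e^{-jwk} */
            A[b].imag -= a[k]*sinf(k*w);     /* imag part of a[k]*e^{-jwk} */
        }
    }
}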
/*---------------------------------------------------------------------------*\

  phase_synth_zero_order()

  Synthesises phases based on SNR and a rule-based approach.  No phase
  parameters are required apart from the SNR (which can be reduced to a
  1 bit V/UV decision per frame).

  The phase of each harmonic is modelled as the phase of a synthesis
  filter excited by an impulse.  In many Codec 2 modes the synthesis
  filter is an LPC filter.  Unlike the first-order model, the position
  of the impulse is not transmitted, so we create an excitation pulse
  train using a rule-based approach.

  Consider a pulse train with a pulse starting at time n=0, with pulses
  repeated at a rate of Wo, the fundamental frequency.  A pulse train
  in the time domain is equivalent to harmonics in the frequency
  domain.  We can make an excitation pulse train using a sum of
  sinusoids:

    for(m=1; m<=L; m++)
      ex[n] += cos(m*Wo*n)

  Note: the Octave script ../octave/phase.m is an example of this if
  you would like to try making a pulse train; a small C sketch of the
  same idea follows this comment block.

  The phase of each excitation harmonic is:

    arg(E[m]) = mWo

  where E[m] are the complex excitation (freq domain) samples, and
  arg(x) just returns the phase of a complex sample x.

  As we don't transmit the pulse position for this model, we need to
  synthesise it.  Now the excitation pulses occur at a rate of Wo.
  This means the phase of the first harmonic advances by Wo*N_SAMP
  radians over a synthesis frame of N_SAMP samples.  For example if Wo
  is pi/20 (200 Hz), then over a 10 ms frame (N_SAMP=80 samples) the
  phase of the first harmonic would advance (pi/20)*80 = 4*pi, or two
  complete cycles.

  We generate the excitation phase of the fundamental (first
  harmonic):

    arg(E[1]) = Wo*N_SAMP;

  We then relate the phase of the m-th excitation harmonic to the
  phase of the fundamental as:

    arg(E[m]) = m*arg(E[1])

  This E[m] then gets passed through the LPC synthesis filter to
  determine the final harmonic phase.

  Comparing to speech synthesised using original phases:

  - Through headphones speech synthesised with this model is not as
    good.  Through a loudspeaker it is very close to original phases.

  - If there are voicing errors, the speech can sound clicky or
    staticky.  If V speech is mistakenly declared UV, this model tends
    to synthesise impulses or clicks, as there is usually very little
    shift or dispersion through the LPC synthesis filter.

  - When combined with LPC amplitude modelling there is an additional
    drop in quality.  I am not sure why; the theory is that
    inter-formant energy is raised, making any phase errors more
    obvious.

  NOTES:

    1/ This synthesis model is effectively the same as a simple LPC-10
    vocoder, and yet sounds much better.  Why?  Conventional wisdom
    (AMBE, MELP) says mixed voicing is required for high quality
    speech.

    2/ I am pretty sure the Lincoln Lab sinusoidal coding guys (like
    xMBE, also from MIT) first described this zero phase model; I need
    to look up the paper.

    3/ Note that this approach could cause some discontinuities in
    the phase at the edge of synthesis frames, as no attempt is made
    to make sure that the phase tracks are continuous (the excitation
    phases are continuous, but not the final phases after filtering
    by the LPC spectra).  Technically this is a bad thing.  However
    this may actually be a good thing, disturbing the phase tracks a
    bit.  More research needed, e.g. test a synthesis model that adds
    a small delta-W to make phase tracks line up for voiced
    harmonics.

\*---------------------------------------------------------------------------*/
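/*
   Illustrative sketch of the excitation pulse train described above (a C
   equivalent of the ../octave/phase.m experiment).  Summing L harmonics of
   Wo that are all in phase at n=0 produces sharp pulses every 2*pi/Wo
   samples.  This helper is not part of the codec; the function below works
   with the harmonic phases arg(E[m]) directly rather than a time domain
   pulse train.
*/

static void excitation_pulse_train_sketch(float Wo, int L, int n_samp, float ex[])
{
    int n, m;

    for (n = 0; n < n_samp; n++) {
        ex[n] = 0.0f;
        for (m = 1; m <= L; m++)
            ex[n] += cosf(m*Wo*n);   /* all harmonics in phase at n = 0 */
    }
}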
void phase_synth_zero_order(
    int    n_samp,
    MODEL *model,
    float *ex_phase,      /* excitation phase of fundamental        */
    COMP   H[]            /* L synthesis filter freq domain samples */
    )
{
    int   m;
    float new_phi;
    COMP  Ex[MAX_AMP+1];  /* excitation samples          */
    COMP  A_[MAX_AMP+1];  /* synthesised harmonic samples */

    /*
      Update excitation fundamental phase track, this sets the position
      of each pitch pulse during voiced speech.  After much experiment
      I found that using just this frame's Wo improved quality for UV
      sounds compared to interpolating two frames Wo like this:

      ex_phase[0] += (*prev_Wo+model->Wo)*N_SAMP/2;
    */

    ex_phase[0] += (model->Wo)*n_samp;
    ex_phase[0] -= TWO_PI*floorf(ex_phase[0]/TWO_PI + 0.5);

    for(m=1; m<=model->L; m++) {

        /* generate excitation */

        if (model->voiced) {

            Ex[m].real = cosf(ex_phase[0]*m);
            Ex[m].imag = sinf(ex_phase[0]*m);
        }
        else {

            /* When a few samples were tested I found that LPC filter
               phase is not needed in the unvoiced case, but no harm in
               keeping it.
            */

            float phi = TWO_PI*(float)codec2_rand()/CODEC2_RAND_MAX;
            Ex[m].real = cosf(phi);
            Ex[m].imag = sinf(phi);
        }

        /* filter using LPC filter */

        A_[m].real = H[m].real*Ex[m].real - H[m].imag*Ex[m].imag;
        A_[m].imag = H[m].imag*Ex[m].real + H[m].real*Ex[m].imag;

        /* modify sinusoidal phase */

        new_phi = atan2f(A_[m].imag, A_[m].real+1E-12);
        model->phi[m] = new_phi;
    }

}
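/*
   Sketch of how the two functions above fit together for one synthesis
   frame.  It assumes A[] (the LPC analysis filter spectrum over FFT_ENC
   bins) has already been computed, and that ex_phase[] persists between
   calls so the excitation phase track stays continuous.  The helper name
   is illustrative; the actual per-frame sequencing lives elsewhere in the
   codec.
*/

static void phase_frame_sketch(MODEL *model, COMP A[], float ex_phase[], int n_samp)
{
    COMP H[MAX_AMP+1];

    sample_phase(model, H, A);                           /* H[m] ~ 1/A at harmonic m */
    phase_synth_zero_order(n_samp, model, ex_phase, H);  /* fills model->phi[1..L]   */
}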
/*---------------------------------------------------------------------------*\

  FUNCTION....: mag_to_phase
  AUTHOR......: David Rowe
  DATE CREATED: Jan 2017

  Algorithm from http://www.dsprelated.com/showcode/20.php ported to C.  See
  also the Octave function mag_to_phase.m

  Given a magnitude spectrum in dB, returns a minimum-phase phase
  spectrum.

\*---------------------------------------------------------------------------*/
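/*
   In outline, the maths behind the cepstral folding step below: for a
   minimum-phase system the complex cepstrum is causal, with

       log|H(w)|   <->  even part of the cepstrum
       j*theta(w)  <->  odd part of the cepstrum

   Folding the real cepstrum (keep c[0] and the Nyquist term, double the
   remaining positive quefrencies by adding their symmetric negative
   counterparts, zero everything else) therefore reconstructs
   log|H(w)| + j*theta(w) with one forward FFT, where theta(w) is the
   minimum phase we want.  Up to sign convention this is the Hilbert
   transform relation theta(w) = -Hilbert{ log|H(w)| }.
*/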
void mag_to_phase(float phase[],   /* Nfft/2+1 output phase samples in radians      */
                  float Gdbfk[],   /* Nfft/2+1 positive freq amplitude samples in dB */
                  int Nfft,
                  codec2_fft_cfg fft_fwd_cfg,
                  codec2_fft_cfg fft_inv_cfg
                  )
{
    COMP Sdb[Nfft], c[Nfft], cf[Nfft], Cf[Nfft];
    int  Ns = Nfft/2+1;
    int  i;

    /* install negative frequency components; the 1/Nfft takes into
       account kiss_fft's lack of scaling on the ifft */

    Sdb[0].real = Gdbfk[0];
    Sdb[0].imag = 0.0;
    for(i=1; i<Ns; i++) {
        Sdb[i].real = Sdb[Nfft-i].real = Gdbfk[i];
        Sdb[i].imag = Sdb[Nfft-i].imag = 0.0;
    }

    /* compute real cepstrum from log magnitude spectrum */

    codec2_fft(fft_inv_cfg, Sdb, c);
    for(i=0; i<Nfft; i++) {
        c[i].real /= (float)Nfft;
        c[i].imag /= (float)Nfft;
    }

    /* Fold cepstrum to reflect non-min-phase zeros inside unit circle */

    cf[0] = c[0];
    for(i=1; i<Ns-1; i++) {
        cf[i] = cadd(c[i],c[Nfft-i]);
    }
    cf[Ns-1] = c[Ns-1];
    for(i=Ns; i<Nfft; i++) {
        cf[i].real = 0.0;
        cf[i].imag = 0.0;
    }

    /* Cf = dB_magnitude + j * minimum_phase */

    codec2_fft(fft_fwd_cfg, cf, Cf);

    /* The maths says we are meant to be using log(x), not 20*log10(x),
       so we need to scale the phase to account for this:
       log(x) = 20*log10(x)/scale */

    float scale = (20.0/logf(10.0));

    for(i=0; i<Ns; i++) {
        phase[i] = Cf[i].imag/scale;
    }

}
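/*
   Minimal usage sketch for mag_to_phase().  The caller is assumed to have
   built forward and inverse FFT configurations of size Nfft (e.g. via the
   codec2 FFT allocation helper); the helper name, the Nfft value of 512
   and the flat 0 dB test spectrum are illustrative assumptions.  A constant
   magnitude spectrum should yield (near) zero minimum phase.
*/

static void mag_to_phase_usage_sketch(codec2_fft_cfg fft_fwd_cfg, codec2_fft_cfg fft_inv_cfg)
{
    const int Nfft = 512;           /* illustrative FFT size                  */
    float Gdbfk[512/2+1];           /* 20*log10 magnitudes, DC..Fs/2, in dB   */
    float phase[512/2+1];           /* minimum-phase output in radians        */
    int   i;

    for (i = 0; i < 512/2+1; i++)   /* flat 0 dB spectrum -> zero phase       */
        Gdbfk[i] = 0.0f;

    mag_to_phase(phase, Gdbfk, Nfft, fft_fwd_cfg, fft_inv_cfg);
}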