From 0af884e70ea96589fd54bbeea0c2459b7ef61f5b Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Wed, 12 Jul 2017 13:50:05 +1000 Subject: [PATCH 01/20] moved Google recogniser into own package, notes for VAD --- .../{recognizer => encoding}/FlacEncoder.java | 2 +- .../speech/microphone/Microphone.java | 47 ++++++++++++++--- .../{ => google}/GSpeechDuplex.java | 4 +- .../{ => google}/GSpeechResponseListener.java | 3 +- .../{ => google}/GoogleResponse.java | 2 +- .../recognizer/{ => google}/Recognizer.java | 4 +- .../{ => google}/RecognizerChunked.java | 2 +- .../recognizer/vad/VoiceActivityDetector.java | 50 +++++++++++++++++++ 8 files changed, 100 insertions(+), 14 deletions(-) rename src/main/java/com/darkprograms/speech/{recognizer => encoding}/FlacEncoder.java (98%) rename src/main/java/com/darkprograms/speech/recognizer/{ => google}/GSpeechDuplex.java (99%) rename src/main/java/com/darkprograms/speech/recognizer/{ => google}/GSpeechResponseListener.java (75%) rename src/main/java/com/darkprograms/speech/recognizer/{ => google}/GoogleResponse.java (97%) rename src/main/java/com/darkprograms/speech/recognizer/{ => google}/Recognizer.java (99%) rename src/main/java/com/darkprograms/speech/recognizer/{ => google}/RecognizerChunked.java (99%) create mode 100644 src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java diff --git a/src/main/java/com/darkprograms/speech/recognizer/FlacEncoder.java b/src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java similarity index 98% rename from src/main/java/com/darkprograms/speech/recognizer/FlacEncoder.java rename to src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java index 180ab9a..bf06c25 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/FlacEncoder.java +++ b/src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java @@ -1,4 +1,4 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.encoding; import 
net.sourceforge.javaflacencoder.FLACEncoder; import net.sourceforge.javaflacencoder.FLACFileOutputStream; diff --git a/src/main/java/com/darkprograms/speech/microphone/Microphone.java b/src/main/java/com/darkprograms/speech/microphone/Microphone.java index 7859050..7ec1425 100644 --- a/src/main/java/com/darkprograms/speech/microphone/Microphone.java +++ b/src/main/java/com/darkprograms/speech/microphone/Microphone.java @@ -40,6 +40,8 @@ public enum CaptureState { */ private File audioFile; + private AudioInputStream audioStream; + /** * Constructor * @@ -100,7 +102,11 @@ public void setTargetDataLine(TargetDataLine targetDataLine) { /** * Initializes the target data line. */ - private void initTargetDataLine(){ + + private void initTargetDataLine() { + initTargetDataLine(8_000F); + } + private void initTargetDataLine(float sampleRate) { DataLine.Info dataLineInfo = new DataLine.Info(TargetDataLine.class, getAudioFormat()); try { setTargetDataLine((TargetDataLine) AudioSystem.getLine(dataLineInfo)); @@ -112,6 +118,22 @@ private void initTargetDataLine(){ } + /** + * @param sampleRate recommend 16_000F or 8_000F + */ + public AudioInputStream captureAudioToStream(float sampleRate) { + setState(CaptureState.STARTING_CAPTURE); + if(getTargetDataLine() == null){ + initTargetDataLine(sampleRate); + } + + //Get Audio +// new Thread(new ListenThread()).start(); + open(); + audioStream = new AudioInputStream(getTargetDataLine()); + return audioStream; + } + /** * Captures audio from the microphone and saves it a file @@ -129,8 +151,6 @@ public void captureAudioToFile(File audioFile) throws LineUnavailableException { //Get Audio new Thread(new CaptureThread()).start(); - - } /** @@ -144,14 +164,19 @@ public void captureAudioToFile(String audioFile) throws LineUnavailableException captureAudioToFile(file); } - /** * The audio format to save in * * @return Returns AudioFormat to be used later when capturing audio from microphone */ public AudioFormat getAudioFormat() { - float 
sampleRate = 8000.0F; + return getAudioFormat(8000.0F); + } + + /** + * @param sampleRate set to 16_000.0F for AWS Lex + */ + public AudioFormat getAudioFormat(float sampleRate) { //8000,11025,16000,22050,44100 int sampleSizeInBits = 16; //8,16 @@ -183,7 +208,6 @@ public void open(){ return; } } - } /** @@ -203,7 +227,6 @@ public void close() { * Thread to capture the audio from the microphone and save it to a file */ private class CaptureThread implements Runnable { - /** * Run method for thread */ @@ -220,4 +243,14 @@ public void run() { } } + /*private class ListenThread implements Runnable { + public void run() { + try { + open(); + audioStream = new AudioInputStream(getTargetDataLine()); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + }*/ } diff --git a/src/main/java/com/darkprograms/speech/recognizer/GSpeechDuplex.java b/src/main/java/com/darkprograms/speech/recognizer/google/GSpeechDuplex.java similarity index 99% rename from src/main/java/com/darkprograms/speech/recognizer/GSpeechDuplex.java rename to src/main/java/com/darkprograms/speech/recognizer/google/GSpeechDuplex.java index a66e844..ed4421d 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/GSpeechDuplex.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/GSpeechDuplex.java @@ -1,4 +1,4 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google; import java.io.File; import java.io.IOException; @@ -184,7 +184,7 @@ public void recognize(TargetDataLine tl, AudioFormat af) throws IOException, Lin /** * This code opens a new Thread that connects to the downstream URL. Due to threading, * the best way to handle this is through the use of listeners. - * @param The URL you want to connect to. + * @param urlStr The URL you want to connect to. 
*/ private Thread downChannel(String urlStr) { final String url = urlStr; diff --git a/src/main/java/com/darkprograms/speech/recognizer/GSpeechResponseListener.java b/src/main/java/com/darkprograms/speech/recognizer/google/GSpeechResponseListener.java similarity index 75% rename from src/main/java/com/darkprograms/speech/recognizer/GSpeechResponseListener.java rename to src/main/java/com/darkprograms/speech/recognizer/google/GSpeechResponseListener.java index dcbbf2a..aca0228 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/GSpeechResponseListener.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/GSpeechResponseListener.java @@ -1,4 +1,5 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google + ; /** * Response listeners for URL connections. diff --git a/src/main/java/com/darkprograms/speech/recognizer/GoogleResponse.java b/src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java similarity index 97% rename from src/main/java/com/darkprograms/speech/recognizer/GoogleResponse.java rename to src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java index 73a86f4..d04dd37 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/GoogleResponse.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java @@ -1,4 +1,4 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google; import java.util.ArrayList; import java.util.List; diff --git a/src/main/java/com/darkprograms/speech/recognizer/Recognizer.java b/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java similarity index 99% rename from src/main/java/com/darkprograms/speech/recognizer/Recognizer.java rename to src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java index cac98a3..18d6fa0 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/Recognizer.java +++ 
b/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java @@ -1,4 +1,4 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google; import java.util.*; import java.io.*; @@ -8,6 +8,8 @@ import org.json.*; +import com.darkprograms.speech.encoding.FlacEncoder; + /*************************************************************** * Class that submits FLAC audio and retrieves recognized text * diff --git a/src/main/java/com/darkprograms/speech/recognizer/RecognizerChunked.java b/src/main/java/com/darkprograms/speech/recognizer/google/RecognizerChunked.java similarity index 99% rename from src/main/java/com/darkprograms/speech/recognizer/RecognizerChunked.java rename to src/main/java/com/darkprograms/speech/recognizer/google/RecognizerChunked.java index 160b395..62614df 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/RecognizerChunked.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/RecognizerChunked.java @@ -1,4 +1,4 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google; import java.io.BufferedReader; diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java new file mode 100644 index 0000000..098c425 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -0,0 +1,50 @@ +package com.darkprograms.speech.recognizer.vad; + +import com.darkprograms.speech.microphone.MicrophoneAnalyzer; + +import javax.sound.sampled.AudioInputStream; + +/** + * @see https://www.researchgate.net/publication/255667085_A_simple_but_efficient_real-time_voice_activity_detection_algorithm + */ +public class VoiceActivityDetector { + public void detect(AudioInputStream audio) { + // E: short-term energy + // SFM: Spectral Flatness Measure - a measure of the noisiness of spectrum and is a good feature in 
Voiced/Unvoiced/Silence detection. + // SFM = 10 log10(Gm/Am) + // Am: arithmetic mean + // Gm: geometric mean + // F: most dominant frequency component of the speech frame spectrum + + + // 1- Set Frame _ Size 10ms=and compute number of frames (FramesOfNum __ )(no frame overlap is required) + // 2- Set one primary threshold for each feature {These thresholds are the only parameters that are set externally} + // • Primary Threshold for Energy (Energy_PrimThresh) + // • Primary Threshold for F (F_PrimThresh) + // • Primary Threshold for SFM (SF_PrimThresh) + // 3- for i from 1 to numOfFrames + // 3-1- Compute frame energy E(i) + // 3-2- Apply FFT on each speech frame. + // 3-2-1- Find kF(i) = arg max(S(k)) as the most dominant frequency component. + // 3-2-2- Compute the abstract value of Spectral Flatness Measure SFM(i) + // 3-3- Supposing that some of the first 30 frames are silence, find the minimum value for E(minE) , F(minF) and SFM (minSF) + // 3-4- Set Decision threshold forE, F and SFM + // • threshE = energyPrimThresh * log(minE) + // • threshF = fPrimThresh + // • threshSF = sfPrimThresh + // 3-5- Set 0=Counter + // • if ((E(i) - minE) >= threshE) then ++Counter + // • if ((F(i) - minF) >= threshF) then ++Counter + // • if ((SFM(i) - minSF) >= threshSF) then ++Counter + // 3-6- If Counter > mark the current frame as speech else mark it as silence. + // 3-7- If current frame is marked as silence, update the energy minimum value: + // minE = (silenceCount * minE) + E(i)) / (silenceCount + 1) + // 3-8- threshE = energyPrimThresh * log(minE) + // 4- Ignore silence run less than 10 successive frames. + // 5- Ignore speech run less than 5 successive frames. 
+ + +// MicrophoneAnalyzer mic = new MicrophoneAnalyzer(null); +// mic.captureAudioToStream(16_000F); + } +} From 99755821e37f558017065dd5ef8cb742f82b0bc3 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Wed, 12 Jul 2017 13:51:44 +1000 Subject: [PATCH 02/20] notes for VAD --- .../speech/recognizer/vad/VoiceActivityDetector.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java index 098c425..0389e9c 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -8,6 +8,9 @@ * @see https://www.researchgate.net/publication/255667085_A_simple_but_efficient_real-time_voice_activity_detection_algorithm */ public class VoiceActivityDetector { + +// MicrophoneAnalyzer mic = new MicrophoneAnalyzer(null); +// mic.captureAudioToStream(16_000F); public void detect(AudioInputStream audio) { // E: short-term energy // SFM: Spectral Flatness Measure - a measure of the noisiness of spectrum and is a good feature in Voiced/Unvoiced/Silence detection. @@ -43,8 +46,5 @@ public void detect(AudioInputStream audio) { // 4- Ignore silence run less than 10 successive frames. // 5- Ignore speech run less than 5 successive frames. 
- -// MicrophoneAnalyzer mic = new MicrophoneAnalyzer(null); -// mic.captureAudioToStream(16_000F); } } From 9998ab01625cd18a0424669e0e32b7de331177d2 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Wed, 12 Jul 2017 14:33:40 +1000 Subject: [PATCH 03/20] more notes --- .../recognizer/vad/VoiceActivityDetector.java | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java index 0389e9c..58f3099 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -1,11 +1,14 @@ package com.darkprograms.speech.recognizer.vad; import com.darkprograms.speech.microphone.MicrophoneAnalyzer; +import com.darkprograms.speech.util.FFT; import javax.sound.sampled.AudioInputStream; /** * @see https://www.researchgate.net/publication/255667085_A_simple_but_efficient_real-time_voice_activity_detection_algorithm + * + * @see https://github.com/Sciss/SpeechRecognitionHMM/blob/master/src/main/java/org/ioe/tprsa/audio/preProcessings/EndPointDetection.java */ public class VoiceActivityDetector { @@ -27,9 +30,18 @@ public void detect(AudioInputStream audio) { // • Primary Threshold for SFM (SF_PrimThresh) // 3- for i from 1 to numOfFrames // 3-1- Compute frame energy E(i) +// mic.calculateRMSLevel(byte[] audioData) + // 3-2- Apply FFT on each speech frame. - // 3-2-1- Find kF(i) = arg max(S(k)) as the most dominant frequency component. +// FFT.fft(Complex[] x) +// https://github.com/Sciss/SpeechRecognitionHMM/blob/master/src/main/java/org/ioe/tprsa/audio/feature/FFT.java + + // 3-2-1- Find F(i) = arg max(S(k)) as the most dominant frequency component. +// getFrequency(byte[] bytes) ???? 
+ // 3-2-2- Compute the abstract value of Spectral Flatness Measure SFM(i) +// https://github.com/filipeuva/SoundBites/blob/master/src/uk/co/biogen/SoundBites/analysis/AnalysisInterface.java#L264 + // 3-3- Supposing that some of the first 30 frames are silence, find the minimum value for E(minE) , F(minF) and SFM (minSF) // 3-4- Set Decision threshold forE, F and SFM // • threshE = energyPrimThresh * log(minE) From a5b7b72fb1ab4721b4c639533df19bf0f01f9c55 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Wed, 12 Jul 2017 14:34:41 +1000 Subject: [PATCH 04/20] more notes --- .../speech/recognizer/vad/VoiceActivityDetector.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java index 58f3099..e9eff18 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -6,9 +6,9 @@ import javax.sound.sampled.AudioInputStream; /** - * @see https://www.researchgate.net/publication/255667085_A_simple_but_efficient_real-time_voice_activity_detection_algorithm + * @see [https://www.researchgate.net/publication/255667085_A_simple_but_efficient_real-time_voice_activity_detection_algorithm] * - * @see https://github.com/Sciss/SpeechRecognitionHMM/blob/master/src/main/java/org/ioe/tprsa/audio/preProcessings/EndPointDetection.java + * @see [https://github.com/Sciss/SpeechRecognitionHMM/blob/master/src/main/java/org/ioe/tprsa/audio/preProcessings/EndPointDetection.java] */ public class VoiceActivityDetector { From cca6c47b898a29ecd21015bee1847ff5173b2709 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Thu, 13 Jul 2017 01:18:24 +1000 Subject: [PATCH 05/20] VAD implemented except for SFM --- .../speech/microphone/Microphone.java | 7 +- .../speech/microphone/MicrophoneAnalyzer.java | 38 
++-- .../recognizer/vad/VoiceActivityDetector.java | 177 +++++++++++++----- .../recognizer/vad/VoiceActivityListener.java | 7 + 4 files changed, 163 insertions(+), 66 deletions(-) create mode 100644 src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java diff --git a/src/main/java/com/darkprograms/speech/microphone/Microphone.java b/src/main/java/com/darkprograms/speech/microphone/Microphone.java index 7ec1425..bb3a6b9 100644 --- a/src/main/java/com/darkprograms/speech/microphone/Microphone.java +++ b/src/main/java/com/darkprograms/speech/microphone/Microphone.java @@ -41,6 +41,7 @@ public enum CaptureState { private File audioFile; private AudioInputStream audioStream; + private float sampleRate; /** * Constructor @@ -102,11 +103,11 @@ public void setTargetDataLine(TargetDataLine targetDataLine) { /** * Initializes the target data line. */ - private void initTargetDataLine() { initTargetDataLine(8_000F); } private void initTargetDataLine(float sampleRate) { + this.sampleRate = sampleRate; DataLine.Info dataLineInfo = new DataLine.Info(TargetDataLine.class, getAudioFormat()); try { setTargetDataLine((TargetDataLine) AudioSystem.getLine(dataLineInfo)); @@ -115,7 +116,6 @@ private void initTargetDataLine(float sampleRate) { e.printStackTrace(); return; } - } /** @@ -130,6 +130,7 @@ public AudioInputStream captureAudioToStream(float sampleRate) { //Get Audio // new Thread(new ListenThread()).start(); open(); + this.sampleRate = sampleRate; audioStream = new AudioInputStream(getTargetDataLine()); return audioStream; } @@ -170,7 +171,7 @@ public void captureAudioToFile(String audioFile) throws LineUnavailableException * @return Returns AudioFormat to be used later when capturing audio from microphone */ public AudioFormat getAudioFormat() { - return getAudioFormat(8000.0F); + return getAudioFormat(sampleRate); } /** diff --git a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java 
b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java index e757a89..e26a068 100644 --- a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java +++ b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java @@ -1,6 +1,8 @@ package com.darkprograms.speech.microphone; import javax.sound.sampled.AudioFileFormat; +import javax.sound.sampled.AudioFormat; + import com.darkprograms.speech.util.*; /******************************************************************************************** @@ -85,8 +87,14 @@ public int getNumOfBytes(int seconds){ * @param seconds The length in seconds * @return the number of bytes the microphone will output over the specified time. */ - public int getNumOfBytes(double seconds){ - return (int)(seconds*getAudioFormat().getSampleRate()*getAudioFormat().getFrameSize()+.5); + public int getNumOfBytes(double seconds) { + AudioFormat format = getAudioFormat(); + return (int)(seconds * format.getSampleRate() * format.getFrameSize() + .5); + } + + public int getNumOfFrames(int bytes) { + AudioFormat format = getAudioFormat(); + return bytes / format.getFrameSize(); } /** @@ -94,8 +102,8 @@ public int getNumOfBytes(double seconds){ * @param numOfBytes The length of the returned array. * @return The specified array or null if it cannot. */ - private byte[] getBytes(int numOfBytes){ - if(getTargetDataLine()!=null){ + private byte[] getBytes(int numOfBytes) { + if (getTargetDataLine()!=null) { byte[] data = new byte[numOfBytes]; this.getTargetDataLine().read(data, 0, numOfBytes); return data; @@ -154,7 +162,7 @@ public int getFrequency(byte[] bytes){ * Applies a Hanning Window to the data set. * Hanning Windows are used to increase the accuracy of the FFT. 
* One should always apply a window to a dataset before applying an FFT - * @param The data you want to apply the window to + * @param data The data you want to apply the window to * @return The windowed data set */ private double[] applyHanningWindow(double[] data){ @@ -165,9 +173,9 @@ private double[] applyHanningWindow(double[] data){ * Applies a Hanning Window to the data set. * Hanning Windows are used to increase the accuracy of the FFT. * One should always apply a window to a dataset before applying an FFT - * @param The data you want to apply the window to - * @param The starting index you want to apply a window from - * @param The size of the window + * @param signal_in The data you want to apply the window to + * @param pos The starting index you want to apply a window from + * @param size The size of the window * @return The windowed data set */ private double[] applyHanningWindow(double[] signal_in, int pos, int size){ @@ -191,20 +199,20 @@ private double[] applyHanningWindow(double[] signal_in, int pos, int size){ * @return The fundamental frequency in Hertz */ private int calculateFundamentalFrequency(Complex[] fftData, int N){ - if(N<=0 || fftData == null){ return -1; } //error case + if (N <= 0 || fftData == null) { return -1; } //error case final int LENGTH = fftData.length;//Used to calculate bin size fftData = removeNegativeFrequencies(fftData); Complex[][] data = new Complex[N][fftData.length/N]; - for(int i = 0; i= threshE) then ++Counter - // • if ((F(i) - minF) >= threshF) then ++Counter - // • if ((SFM(i) - minSF) >= threshSF) then ++Counter - // 3-6- If Counter > mark the current frame as speech else mark it as silence. - // 3-7- If current frame is marked as silence, update the energy minimum value: - // minE = (silenceCount * minE) + E(i)) / (silenceCount + 1) - // 3-8- threshE = energyPrimThresh * log(minE) - // 4- Ignore silence run less than 10 successive frames. - // 5- Ignore speech run less than 5 successive frames. 
+public class VoiceActivityDetector implements Runnable { + private static final int WINDOW_MILLIS = 10; + private static final double WINDOW_SECONDS = WINDOW_MILLIS / 1000; + private static final int IGNORE_SILENCE_WINDOWS = 10; + private static final int IGNORE_SPEECH_WINDOWS = 5; + /** maximum ms between words */ + private static final int MAX_SILENCE_MILLIS = 4; + /** minimum duration of speech to recognise */ + private static final int MIN_SPEECH_MILLIS = 200; + private static final int MAX_SPEECH_MILLIS = 60_000; + private static final int MAX_SILENCE_WINDOWS = MAX_SILENCE_MILLIS / WINDOW_MILLIS; + private static final int MIN_SPEECH_WINDOWS = MIN_SPEECH_MILLIS / WINDOW_MILLIS; + private static final int MAX_SPEECH_WINDOWS = MAX_SPEECH_MILLIS / WINDOW_MILLIS; + private static final int ENERGY_PRIMARY_THRESHOLD = 40; + private static final int FREQUENCY_PRIMARY_THRESHOLD = 185; + private static final int SPECTRAL_FLATNESS_PRIMARY_THRESHOLD = 5; + private AudioInputStream audio; + private MicrophoneAnalyzer mic; + private VoiceActivityListener listener; + private VadState state; + + private enum VadState { + LISTENING, + DETECTED_SPEECH, + DETECTED_SILENCE_AFTER_SPEECH + } + +// public void detectVoiceActivity(AudioInputStream audio) { + public void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener listener) { + this.listener = listener; + this.mic = mic; + this.audio = mic.captureAudioToStream(16_000F); + new Thread(this).start(); + } + + public void run() { + byte[] audioData = new byte[mic.getNumOfBytes(WINDOW_SECONDS)]; + int offset = 0; + int bufferSize = MAX_SPEECH_MILLIS * this.mic.getNumOfBytes(0.001); + int silenceCount = 0; + int speechCount = 0; + int minEnergy = Integer.MAX_VALUE; + int minFrequency = Integer.MAX_VALUE; + int minSpectralFlatness = Integer.MAX_VALUE; + ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(bufferSize); + + state = VadState.LISTENING; + + while (true) { + try { + int bytesRead = 
this.audio.read(audioData); + + int energy = mic.calculateRMSLevel(audioData); + int frequency = mic.getFrequency(audioData); + // 3-2-2- Compute the abstract value of Spectral Flatness Measure SFM(i) +// TODO https://github.com/filipeuva/SoundBites/blob/master/src/uk/co/biogen/SoundBites/analysis/AnalysisInterface.java#L264 + + // 3-3- Supposing that some of the first 30 frames are silence, find the minimum value for E, F & SF + minEnergy = Math.min(minEnergy, energy); + minFrequency = Math.min(minFrequency, frequency); +// minSpectralFlatness = Math.min(minSpectralFlatness, energy); + + double energyThreshold = ENERGY_PRIMARY_THRESHOLD * Math.log(minEnergy); + + int counter = 0; + if (energy - minEnergy >= energyThreshold) counter++; + if (frequency - minFrequency >= FREQUENCY_PRIMARY_THRESHOLD) counter++; + if (sfm - minSpectralFlatness) >= SPECTRAL_FLATNESS_PRIMARY_THRESHOLD) counter++; + + if (counter > 1) { + // speech + speechCount++; + // Ignore speech runs less than 5 successive frames. + if (state != VadState.DETECTED_SPEECH && speechCount >= IGNORE_SPEECH_WINDOWS) { + state = VadState.DETECTED_SPEECH; + silenceCount = 0; + } + + if (offset + bytesRead < bufferSize) { + outBuffer.write(audioData, offset, bytesRead); + offset += bytesRead; + + if (speechCount >= MAX_SPEECH_WINDOWS) { + // in theory, this should be handled by the following end of buffer handler + emitVoiceActivity(outBuffer); + offset = 0; + } + } else { + // Reached the end of the buffer! Send what we've captured so far + bytesRead = bufferSize - offset; + outBuffer.write(audioData, offset, bytesRead); + emitVoiceActivity(outBuffer); + offset = 0; + } + } else { + // silence + silenceCount++; + minEnergy = ((silenceCount * minEnergy) + energy) / (silenceCount + 1); + // Ignore silence runs less than 10 successive frames. 
+ if (state == VadState.DETECTED_SPEECH && silenceCount >= IGNORE_SILENCE_WINDOWS) { + if (speechCount > MIN_SPEECH_WINDOWS) { + // We have silence after a chunk of speech worth processing + emitVoiceActivity(outBuffer); + offset = 0; + } + + state = VadState.DETECTED_SILENCE_AFTER_SPEECH; + speechCount = 0; + } + } + } catch (IOException e) { + e.printStackTrace(); + return; + } + } + } + + private void emitVoiceActivity(ByteArrayOutputStream outBuffer) { + listener.onVoiceeActivity(createVoiceActivityStream(outBuffer)); + outBuffer.reset(); + state = VadState.LISTENING; + } + + private AudioInputStream createVoiceActivityStream(ByteArrayOutputStream outBuffer) { + return new AudioInputStream(new ByteArrayInputStream(outBuffer.toByteArray()), audio.getFormat(), mic.getNumOfFrames(outBuffer.size())); } } diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java new file mode 100644 index 0000000..3258424 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java @@ -0,0 +1,7 @@ +package com.darkprograms.speech.recognizer.vad; + +import javax.sound.sampled.AudioInputStream; + +public interface VoiceActivityListener { + public void onVoiceeActivity(AudioInputStream audioInputStream); +} From 8b80b572e8fb2ccd5b39e67aea01fc56ca83831e Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Thu, 13 Jul 2017 02:40:01 +1000 Subject: [PATCH 06/20] VAD ready for test --- .../speech/microphone/MicrophoneAnalyzer.java | 71 ++++++++++++++++++- .../recognizer/vad/VoiceActivityDetector.java | 26 ++++--- 2 files changed, 85 insertions(+), 12 deletions(-) diff --git a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java index e26a068..e72baf7 100644 --- a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java +++ 
b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java @@ -157,6 +157,76 @@ public int getFrequency(byte[] bytes){ Complex[] fftTransformed = FFT.fft(complex); return this.calculateFundamentalFrequency(fftTransformed, 4); } + +/* *//** + * borrowed from http://www.programcreek.com/java-api-examples/index.php?source_dir=Audio-Descriptors-master/src/audio/descriptors/AudioDescriptor.java + * + * Spectral flatness provides a way to quantify how tone-like a sound is, as opposed to + * being noise-like. The meaning of tonal in this context is in the sense of the amount of peaks + * or resonant structure in a power spectrum, as opposed to flat spectrum of a white noise. + * + * @param bytes + * @return Spectral Flatness coefficient + *//* + public int calculateSpectralFlatness(byte[] bytes) { + // compute FFT + FFT f = new FFT(x.bufferSize(), x.sampleRate()); + f.window(FFT.HAMMING); + f.forward(x.right); + + float num = 1; + float den = 0; + float Si = 0; + float asf = 0; // result + + final int n = -8; + final int B = 24; // number of bands + final float loF = (float) (Math.pow(2, n/4.0) * 1000); // lowest frequency [Hz] + final float hiF = (float) (Math.pow(2, B/4.0) * loF); // highest frequency [Hz] + final int loK = f.freqToIndex(loF); + final int hiK = f.freqToIndex(hiF); + final float reduceFactor = hiK - loK + 1; + + for(int k=loK; k= energyThreshold) counter++; - if (frequency - minFrequency >= FREQUENCY_PRIMARY_THRESHOLD) counter++; - if (sfm - minSpectralFlatness) >= SPECTRAL_FLATNESS_PRIMARY_THRESHOLD) counter++; + if (energy - minEnergy >= energyThreshold) counter++; + if (frequency - minFrequency >= FREQUENCY_PRIMARY_THRESHOLD) counter++; +// if (sfm - minSpectralFlatness) >= SPECTRAL_FLATNESS_PRIMARY_THRESHOLD) counter++; + } if (counter > 1) { // speech @@ -114,7 +118,7 @@ public void run() { minEnergy = ((silenceCount * minEnergy) + energy) / (silenceCount + 1); // Ignore silence runs less than 10 successive frames. 
if (state == VadState.DETECTED_SPEECH && silenceCount >= IGNORE_SILENCE_WINDOWS) { - if (speechCount > MIN_SPEECH_WINDOWS) { + if (silenceCount >= MAX_SILENCE_WINDOWS && speechCount >= MIN_SPEECH_WINDOWS) { // We have silence after a chunk of speech worth processing emitVoiceActivity(outBuffer); offset = 0; From c8d616e10a8b0ce0df18c804858c526adeaa0e0d Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Thu, 13 Jul 2017 03:14:31 +1000 Subject: [PATCH 07/20] Inital AWS Lex implementation --- pom.xml | 5 +++ .../speech/recognizer/RecognitionResult.java | 7 ++++ .../speech/recognizer/SpeechRecogniser.java | 4 ++ .../recognizer/awslex/LexRecogniser.java | 42 +++++++++++++++++++ .../speech/recognizer/awslex/LexResponse.java | 38 +++++++++++++++++ .../recognizer/google/GoogleResponse.java | 4 +- .../speech/recognizer/google/Recognizer.java | 3 +- 7 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 src/main/java/com/darkprograms/speech/recognizer/RecognitionResult.java create mode 100644 src/main/java/com/darkprograms/speech/recognizer/SpeechRecogniser.java create mode 100644 src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java create mode 100644 src/main/java/com/darkprograms/speech/recognizer/awslex/LexResponse.java diff --git a/pom.xml b/pom.xml index 0c80b1c..dd44b65 100644 --- a/pom.xml +++ b/pom.xml @@ -82,6 +82,11 @@ json 20150729 + + com.amazonaws + aws-java-sdk-lex + 1.11.160 + diff --git a/src/main/java/com/darkprograms/speech/recognizer/RecognitionResult.java b/src/main/java/com/darkprograms/speech/recognizer/RecognitionResult.java new file mode 100644 index 0000000..134aaae --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/RecognitionResult.java @@ -0,0 +1,7 @@ +package com.darkprograms.speech.recognizer; + +public interface RecognitionResult { + /** @return String representation of what was said */ + String getResponse(); + boolean isFinalResponse(); +} diff --git 
a/src/main/java/com/darkprograms/speech/recognizer/SpeechRecogniser.java b/src/main/java/com/darkprograms/speech/recognizer/SpeechRecogniser.java new file mode 100644 index 0000000..48b2e81 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/SpeechRecogniser.java @@ -0,0 +1,4 @@ +package com.darkprograms.speech.recognizer; + +public interface SpeechRecogniser { +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java new file mode 100644 index 0000000..ceb27d5 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java @@ -0,0 +1,42 @@ +package com.darkprograms.speech.recognizer.awslex; + +import com.amazonaws.services.lexruntime.AmazonLexRuntime; +import com.amazonaws.services.lexruntime.AmazonLexRuntimeClientBuilder; +import com.amazonaws.services.lexruntime.model.PostContentRequest; +import com.amazonaws.services.lexruntime.model.PostContentResult; +import com.amazonaws.services.lexruntime.model.PostTextRequest; +import com.darkprograms.speech.recognizer.RecognitionResult; + +import javax.sound.sampled.AudioInputStream; +import java.util.Map; + +public class LexRecogniser { + private String userId; + private AmazonLexRuntime lex; + + LexRecogniser(String userId) { + this.userId = userId; + lex = AmazonLexRuntimeClientBuilder.standard() +// .withClientConfiguration() +// .withCredentials() +// .withRegion() + .build(); + } + + /** + * @param stream + * @param sessionAttributes The value must be map (keys and values must be strings) that is JSON serialized and then base64 encoded + * @return + */ + public RecognitionResult getRecognizedDataForStream(AudioInputStream stream, String sessionAttributes) { + PostContentRequest request = new PostContentRequest() + .withBotName("idear") + .withBotAlias("PROD") + .withInputStream(stream) + .withUserId(userId) + .withSessionAttributes(sessionAttributes); + + 
PostContentResult result = lex.postContent(request); + return new LexResponse(result); + } +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexResponse.java b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexResponse.java new file mode 100644 index 0000000..f987095 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexResponse.java @@ -0,0 +1,38 @@ +package com.darkprograms.speech.recognizer.awslex; + +import com.amazonaws.services.lexruntime.model.PostContentResult; +import com.darkprograms.speech.recognizer.RecognitionResult; + +public class LexResponse implements RecognitionResult { + private PostContentResult result; +// private String response; + + public LexResponse(PostContentResult result) { + this.result = result; +// this.response = result.getInputTranscript(); + + // Close - Fulfilled or Failed (ReadyForFulfillment?) + // Incomplete - ElicitIntent, ConfirmIntent, ElicitSlot +// result.getDialogState(); +// result.getIntentName(); +// result.getMessage(); +// result.getSessionAttributes(); +// result.getSlots(); +// result.getSlotToElicit(); +// No card?!!! 
+ } + + public String getResponse() { + return result.getInputTranscript(); +// return null; + } + + public boolean isFinalResponse() { + String state = result.getDialogState(); + return "Fulfilled".equals(state) || "Failed".equals(state); + } + + public PostContentResult getResult() { + return result; + } +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java b/src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java index d04dd37..666dadc 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java @@ -1,5 +1,7 @@ package com.darkprograms.speech.recognizer.google; +import com.darkprograms.speech.recognizer.RecognitionResult; + import java.util.ArrayList; import java.util.List; @@ -8,7 +10,7 @@ * * @author Luke Kuza, Duncan Jauncey, Aaron Gokaslan ******************************************************************************/ -public class GoogleResponse { +public class GoogleResponse implements RecognitionResult { /** * Variable that holds the response diff --git a/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java b/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java index 18d6fa0..311f3fc 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java @@ -6,6 +6,7 @@ import java.net.URLConnection; import java.nio.charset.Charset; +import com.darkprograms.speech.recognizer.SpeechRecogniser; import org.json.*; import com.darkprograms.speech.encoding.FlacEncoder; @@ -15,7 +16,7 @@ * * @author Luke Kuza, Duncan Jauncey, Aaron Gokaslan **************************************************************/ -public class Recognizer { +public class Recognizer implements SpeechRecogniser { public enum Languages{ AUTO_DETECT("auto"),//tells Google to auto-detect the language 
From 0f77d0cb80a01c5649ad4043df7f42cfe51e688e Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Thu, 13 Jul 2017 21:57:13 +1000 Subject: [PATCH 08/20] more ways of providing sessionAttributes, application provides LexRuntime --- .../recognizer/awslex/LexRecogniser.java | 103 ++++++++++++++++-- 1 file changed, 92 insertions(+), 11 deletions(-) diff --git a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java index ceb27d5..5dcda27 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java +++ b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java @@ -1,26 +1,107 @@ package com.darkprograms.speech.recognizer.awslex; import com.amazonaws.services.lexruntime.AmazonLexRuntime; -import com.amazonaws.services.lexruntime.AmazonLexRuntimeClientBuilder; import com.amazonaws.services.lexruntime.model.PostContentRequest; import com.amazonaws.services.lexruntime.model.PostContentResult; -import com.amazonaws.services.lexruntime.model.PostTextRequest; import com.darkprograms.speech.recognizer.RecognitionResult; +import org.json.JSONObject; import javax.sound.sampled.AudioInputStream; +import java.util.Base64; import java.util.Map; +/** + * example: + *
+ LexRecogniser lex = new LexRecogniser(AmazonLexRuntimeClientBuilder.defaultClient(), "MyLexBot", "PROD", "auser");
+ MicrophoneAnalyzer mic = new MicrophoneAnalyzer(null);
+ VoiceActivityDetector vad = new VoiceActivityDetector();
+
+ vad.detectVoiceActivity(mic, audioInputStream -> {
+    PostContentResult result = lex.getRecognizedDataForStream(audioInputStream, myApp.getSessionAttributes()).getResult();
+    System.out.println(result.getMessage());
+ });
+ * 
+ */ public class LexRecogniser { - private String userId; private AmazonLexRuntime lex; + private String botName; + private String botAlias; + private String userId; - LexRecogniser(String userId) { + public LexRecogniser(AmazonLexRuntime lex, String botName, String botAlias, String userId) { + this.lex = lex; + this.botName = botName; + this.botAlias = botAlias; this.userId = userId; - lex = AmazonLexRuntimeClientBuilder.standard() -// .withClientConfiguration() -// .withCredentials() -// .withRegion() - .build(); + } + + public RecognitionResult getRecognizedDataForStream(AudioInputStream stream, Map sessionAttributes) { + String json; + if (sessionAttributes == null || sessionAttributes.isEmpty()) { + json = null; + } else { + StringBuilder str = null; + + for (Map.Entry entry : sessionAttributes.entrySet()) { + String key = entry.getKey(); + String value = entry.getValue(); + + if (str == null) { + str = new StringBuilder("{"); + } else { + str.append(","); + } + + str.append("\"").append(key).append("\":"); + if (value == null) { + str.append("null"); + } else { + str.append(JSONObject.quote(value)); + } + } + + json = str.append("}").toString(); + } + + return getRecognizedDataForStream(stream, new String(Base64.getEncoder().encode(json.getBytes()))); + } + + public RecognitionResult getRecognizedDataForStreamWithObjects(AudioInputStream stream, Map sessionAttributes) { + String json; + if (sessionAttributes == null || sessionAttributes.isEmpty()) { + json = null; + } else { + StringBuilder str = null; + + for (Map.Entry entry : sessionAttributes.entrySet()) { + String key = entry.getKey(); + Object value = entry.getValue(); + + if (str == null) { + str = new StringBuilder("{"); + } else { + str.append(","); + } + + str.append("\"").append(key).append("\":"); +// if (value == null) { +// str.append("null"); +// } else if (value instanceof Number) { +// str.append(JSONObject.numberToString((Number)value)); +// } else if (value instanceof Boolean) { +// 
str.append(((Boolean)value).toString()); +// } else if (value instanceof String) { +// str.append("\"").append((String)value).append("\""); +// } else { + str.append(JSONObject.valueToString(value)); +// } + } + + json = str.append("}").toString(); + } + + return getRecognizedDataForStream(stream, new String(Base64.getEncoder().encode(json.getBytes()))); } /** @@ -30,8 +111,8 @@ public class LexRecogniser { */ public RecognitionResult getRecognizedDataForStream(AudioInputStream stream, String sessionAttributes) { PostContentRequest request = new PostContentRequest() - .withBotName("idear") - .withBotAlias("PROD") + .withBotName(botName) + .withBotAlias(botAlias) .withInputStream(stream) .withUserId(userId) .withSessionAttributes(sessionAttributes); From 62389ea35aea587d7054991686aee2a69b3f06be Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Thu, 13 Jul 2017 21:57:54 +1000 Subject: [PATCH 09/20] updated documentation, bumped to v 2.0.0-SNAPSHOT (breaking changes) --- README.markdown | 1 + pom.xml | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/README.markdown b/README.markdown index e6bfcf7..9c48174 100644 --- a/README.markdown +++ b/README.markdown @@ -11,6 +11,7 @@ The J.A.R.V.I.S. 
Speech API is designed to be simple and efficient, using the sp The API currently provides the following functionality, * Microphone Capture API (Wrapped around the current Java API for simplicity) + * Voice Activity Detector * A speech recognizer using Google's recognizer service * Converts WAVE files from microphone input to FLAC (using existing API, see CREDITS) * Retrieves Response from Google, including confidence score and text diff --git a/pom.xml b/pom.xml index dd44b65..d750e70 100644 --- a/pom.xml +++ b/pom.xml @@ -4,11 +4,13 @@ com.darkprograms.speech java-speech-api - 1.13.0-SNAPSHOT + 2.0.0-SNAPSHOT jar UTF-8 + 1.8 + 1.8 java-speech-api @@ -69,6 +71,10 @@ https://github.com/AranHase AranHase + + https://github.com/nalbion + nalbion + From eba6ebdc6537e0df85db5a2c5dc0ee3975e4e0f0 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Thu, 13 Jul 2017 22:32:19 +1000 Subject: [PATCH 10/20] added alternate Microphone constructor --- .../com/darkprograms/speech/microphone/Microphone.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/main/java/com/darkprograms/speech/microphone/Microphone.java b/src/main/java/com/darkprograms/speech/microphone/Microphone.java index bb3a6b9..07e14d5 100644 --- a/src/main/java/com/darkprograms/speech/microphone/Microphone.java +++ b/src/main/java/com/darkprograms/speech/microphone/Microphone.java @@ -55,6 +55,15 @@ public Microphone(AudioFileFormat.Type fileType) { initTargetDataLine(); } + /** + * Constructor for use with {@link #captureAudioToStream(float sampleRate)} + * @param sampleRate + */ + public Microphone(float sampleRate) { + setState(CaptureState.CLOSED); + initTargetDataLine(sampleRate); + } + /** * Gets the current state of Microphone * From 4fe5192b918714511247ea7e1ff1027158172223 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Thu, 13 Jul 2017 22:35:01 +1000 Subject: [PATCH 11/20] added alternate Microphone constructor --- .../com/darkprograms/speech/microphone/Microphone.java | 2 +- 
.../speech/microphone/MicrophoneAnalyzer.java | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/darkprograms/speech/microphone/Microphone.java b/src/main/java/com/darkprograms/speech/microphone/Microphone.java index 07e14d5..1424a50 100644 --- a/src/main/java/com/darkprograms/speech/microphone/Microphone.java +++ b/src/main/java/com/darkprograms/speech/microphone/Microphone.java @@ -57,7 +57,7 @@ public Microphone(AudioFileFormat.Type fileType) { /** * Constructor for use with {@link #captureAudioToStream(float sampleRate)} - * @param sampleRate + * @param sampleRate samples per second - 16_000 (recommended) or 8_000 */ public Microphone(float sampleRate) { setState(CaptureState.CLOSED); diff --git a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java index e72baf7..0bd3701 100644 --- a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java +++ b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java @@ -22,6 +22,14 @@ public class MicrophoneAnalyzer extends Microphone { public MicrophoneAnalyzer(AudioFileFormat.Type fileType){ super(fileType); } + + /** + * Constructor for use with {@link #captureAudioToStream(float sampleRate)} + * @param sampleRate samples per second - 16_000 (recommended) or 8_000 + */ + public MicrophoneAnalyzer(float sampleRate){ + super(sampleRate); + } /** * Gets the volume of the microphone input From 9615014af3acc7047708601a501b0f77d814e992 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Fri, 14 Jul 2017 00:20:11 +1000 Subject: [PATCH 12/20] integrating with idear --- .../speech/microphone/Microphone.java | 9 +++----- .../recognizer/awslex/LexRecogniser.java | 22 ++++++++++++++++++- .../recognizer/vad/VoiceActivityDetector.java | 4 ++-- .../recognizer/vad/VoiceActivityListener.java | 2 +- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git 
a/src/main/java/com/darkprograms/speech/microphone/Microphone.java b/src/main/java/com/darkprograms/speech/microphone/Microphone.java index 1424a50..7ed0ecd 100644 --- a/src/main/java/com/darkprograms/speech/microphone/Microphone.java +++ b/src/main/java/com/darkprograms/speech/microphone/Microphone.java @@ -56,7 +56,7 @@ public Microphone(AudioFileFormat.Type fileType) { } /** - * Constructor for use with {@link #captureAudioToStream(float sampleRate)} + * Constructor for use with {@link #captureAudioToStream()} * @param sampleRate samples per second - 16_000 (recommended) or 8_000 */ public Microphone(float sampleRate) { @@ -127,13 +127,10 @@ private void initTargetDataLine(float sampleRate) { } } - /** - * @param sampleRate recommend 16_000F or 8_000F - */ - public AudioInputStream captureAudioToStream(float sampleRate) { + public AudioInputStream captureAudioToStream() { setState(CaptureState.STARTING_CAPTURE); if(getTargetDataLine() == null){ - initTargetDataLine(sampleRate); + initTargetDataLine(); } //Get Audio diff --git a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java index 5dcda27..62775fc 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java +++ b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java @@ -36,6 +36,20 @@ public LexRecogniser(AmazonLexRuntime lex, String botName, String botAlias, Stri this.userId = userId; } + public void setUserId(String userId) { + this.userId = userId; + } + + public LexResponse getRecognizedDataForStream(AudioInputStream stream) { + return getRecognizedDataForStream(stream, (String)null); + } + + /** + * @see #getRecognizedDataForStreamWithObjects(AudioInputStream, Map) + * @param stream + * @param sessionAttributes simple key:value attributes + * @return + */ public RecognitionResult getRecognizedDataForStream(AudioInputStream stream, Map sessionAttributes) { 
String json; if (sessionAttributes == null || sessionAttributes.isEmpty()) { @@ -67,6 +81,12 @@ public RecognitionResult getRecognizedDataForStream(AudioInputStream stream, Map return getRecognizedDataForStream(stream, new String(Base64.getEncoder().encode(json.getBytes()))); } + /** + * Each value of sesssionAttributes will be converted to a String containing JSON + * @param stream + * @param sessionAttributes + * @return + */ public RecognitionResult getRecognizedDataForStreamWithObjects(AudioInputStream stream, Map sessionAttributes) { String json; if (sessionAttributes == null || sessionAttributes.isEmpty()) { @@ -109,7 +129,7 @@ public RecognitionResult getRecognizedDataForStreamWithObjects(AudioInputStream * @param sessionAttributes The value must be map (keys and values must be strings) that is JSON serialized and then base64 encoded * @return */ - public RecognitionResult getRecognizedDataForStream(AudioInputStream stream, String sessionAttributes) { + public LexResponse getRecognizedDataForStream(AudioInputStream stream, String sessionAttributes) { PostContentRequest request = new PostContentRequest() .withBotName(botName) .withBotAlias(botAlias) diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java index 8e90185..d30bcb3 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -45,7 +45,7 @@ private enum VadState { public void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener listener) { this.listener = listener; this.mic = mic; - this.audio = mic.captureAudioToStream(16_000F); + this.audio = mic.captureAudioToStream(); new Thread(this).start(); } @@ -136,7 +136,7 @@ public void run() { } private void emitVoiceActivity(ByteArrayOutputStream outBuffer) { - 
listener.onVoiceeActivity(createVoiceActivityStream(outBuffer)); + listener.onVoiceActivity(createVoiceActivityStream(outBuffer)); outBuffer.reset(); state = VadState.LISTENING; } diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java index 3258424..ebd1fe9 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java @@ -3,5 +3,5 @@ import javax.sound.sampled.AudioInputStream; public interface VoiceActivityListener { - public void onVoiceeActivity(AudioInputStream audioInputStream); + void onVoiceActivity(AudioInputStream audioInputStream); } From e3650e1b2b211850a34e7bf431044d942a4cc137 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Fri, 14 Jul 2017 00:27:04 +1000 Subject: [PATCH 13/20] adopted US spelling --- .../{SpeechRecogniser.java => SpeechRecognizer.java} | 2 +- .../awslex/{LexRecogniser.java => LexRecognizer.java} | 6 +++--- .../darkprograms/speech/recognizer/google/Recognizer.java | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) rename src/main/java/com/darkprograms/speech/recognizer/{SpeechRecogniser.java => SpeechRecognizer.java} (56%) rename src/main/java/com/darkprograms/speech/recognizer/awslex/{LexRecogniser.java => LexRecognizer.java} (96%) diff --git a/src/main/java/com/darkprograms/speech/recognizer/SpeechRecogniser.java b/src/main/java/com/darkprograms/speech/recognizer/SpeechRecognizer.java similarity index 56% rename from src/main/java/com/darkprograms/speech/recognizer/SpeechRecogniser.java rename to src/main/java/com/darkprograms/speech/recognizer/SpeechRecognizer.java index 48b2e81..67483f3 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/SpeechRecogniser.java +++ b/src/main/java/com/darkprograms/speech/recognizer/SpeechRecognizer.java @@ -1,4 +1,4 @@ package 
com.darkprograms.speech.recognizer; -public interface SpeechRecogniser { +public interface SpeechRecognizer { } diff --git a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java similarity index 96% rename from src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java rename to src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java index 62775fc..2fa7656 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecogniser.java +++ b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java @@ -13,7 +13,7 @@ /** * example: *
- LexRecogniser lex = new LexRecogniser(AmazonLexRuntimeClientBuilder.defaultClient(), "MyLexBot", "PROD", "auser");
+ LexRecognizer lex = new LexRecognizer(AmazonLexRuntimeClientBuilder.defaultClient(), "MyLexBot", "PROD", "auser");
  MicrophoneAnalyzer mic = new MicrophoneAnalyzer(null);
  VoiceActivityDetector vad = new VoiceActivityDetector();
 
@@ -23,13 +23,13 @@
  });
  * 
*/ -public class LexRecogniser { +public class LexRecognizer { private AmazonLexRuntime lex; private String botName; private String botAlias; private String userId; - public LexRecogniser(AmazonLexRuntime lex, String botName, String botAlias, String userId) { + public LexRecognizer(AmazonLexRuntime lex, String botName, String botAlias, String userId) { this.lex = lex; this.botName = botName; this.botAlias = botAlias; diff --git a/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java b/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java index 311f3fc..431b724 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java @@ -6,7 +6,7 @@ import java.net.URLConnection; import java.nio.charset.Charset; -import com.darkprograms.speech.recognizer.SpeechRecogniser; +import com.darkprograms.speech.recognizer.SpeechRecognizer; import org.json.*; import com.darkprograms.speech.encoding.FlacEncoder; @@ -16,7 +16,7 @@ * * @author Luke Kuza, Duncan Jauncey, Aaron Gokaslan **************************************************************/ -public class Recognizer implements SpeechRecogniser { +public class Recognizer implements SpeechRecognizer { public enum Languages{ AUTO_DETECT("auto"),//tells Google to auto-detect the language From f7f3568f128573003e2863ccecfd41c251dbc1b8 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Fri, 14 Jul 2017 01:14:32 +1000 Subject: [PATCH 14/20] fixed divide by zero, added thread name --- .../speech/recognizer/vad/VoiceActivityDetector.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java index d30bcb3..b235ea4 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java +++ 
b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -15,7 +15,7 @@ */ public class VoiceActivityDetector implements Runnable { private static final int WINDOW_MILLIS = 10; - private static final double WINDOW_SECONDS = WINDOW_MILLIS / 1000; + private static final double WINDOW_SECONDS = (double)WINDOW_MILLIS / 1000; private static final int IGNORE_SILENCE_WINDOWS = 10; private static final int IGNORE_SPEECH_WINDOWS = 5; /** maximum ms between words */ @@ -46,7 +46,7 @@ public void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener li this.listener = listener; this.mic = mic; this.audio = mic.captureAudioToStream(); - new Thread(this).start(); + new Thread(this, "JARVIS-VAD").start(); } public void run() { From 5d94bbf7eb6e01e109680de78d23acb00436c513 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Fri, 14 Jul 2017 03:01:47 +1000 Subject: [PATCH 15/20] I'm sure all of this is wrong --- .../darkprograms/speech/microphone/MicrophoneAnalyzer.java | 7 ++++--- .../speech/recognizer/vad/VoiceActivityDetector.java | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java index 0bd3701..2519201 100644 --- a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java +++ b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java @@ -333,7 +333,7 @@ private int findMaxMagnitude(Complex[] input){ //Calculates Maximum Magnitude of the array double max = Double.MIN_VALUE; int index = -1; - for(int i = 0; imax){ @@ -354,8 +354,9 @@ private double[] bytesToDoubleArray(byte[] bufferData){ final int bytesRecorded = bufferData.length; final int bytesPerSample = getAudioFormat().getSampleSizeInBits()/8; final double amplification = 100.0; // choose a number as you like - double[] micBufferData = new double[bytesRecorded - bytesPerSample +1]; - 
for (int index = 0, floatIndex = 0; index < bytesRecorded - bytesPerSample + 1; index += bytesPerSample, floatIndex++) { + int micBufferLength = bytesRecorded; // bytesRecorded - bytesPerSample +1 + double[] micBufferData = new double[micBufferLength]; + for (int index = 0, floatIndex = 0; index < micBufferLength; index += bytesPerSample, floatIndex++) { double sample = 0; for (int b = 0; b < bytesPerSample; b++) { int v = bufferData[index + b]; diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java index b235ea4..472ae93 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -14,7 +14,7 @@ * @see [https://github.com/Sciss/SpeechRecognitionHMM/blob/master/src/main/java/org/ioe/tprsa/audio/preProcessings/EndPointDetection.java] */ public class VoiceActivityDetector implements Runnable { - private static final int WINDOW_MILLIS = 10; + private static final int WINDOW_MILLIS = 8; private static final double WINDOW_SECONDS = (double)WINDOW_MILLIS / 1000; private static final int IGNORE_SILENCE_WINDOWS = 10; private static final int IGNORE_SPEECH_WINDOWS = 5; @@ -87,7 +87,7 @@ public void run() { // if (sfm - minSpectralFlatness) >= SPECTRAL_FLATNESS_PRIMARY_THRESHOLD) counter++; } - if (counter > 1) { + if (counter >= 0) { // speech speechCount++; // Ignore speech runs less than 5 successive frames. 
From ef5bc8e0d9f65c023a041d6433ce505a52d46659 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Fri, 14 Jul 2017 03:39:50 +1000 Subject: [PATCH 16/20] debugging --- .../speech/recognizer/awslex/LexRecognizer.java | 5 ++++- .../speech/recognizer/vad/VoiceActivityDetector.java | 11 +++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java index 2fa7656..063f3d8 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java +++ b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java @@ -133,10 +133,13 @@ public LexResponse getRecognizedDataForStream(AudioInputStream stream, String se PostContentRequest request = new PostContentRequest() .withBotName(botName) .withBotAlias(botAlias) - .withInputStream(stream) .withUserId(userId) + .withInputStream(stream) + .withContentType("audio/l16; rate=16000; channels=1") .withSessionAttributes(sessionAttributes); + System.out.println("sending request to Lex: " + request); + PostContentResult result = lex.postContent(request); return new LexResponse(result); } diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java index 472ae93..0420925 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -22,7 +22,7 @@ public class VoiceActivityDetector implements Runnable { private static final int MAX_SILENCE_MILLIS = 4; /** minimum duration of speech to recognise */ private static final int MIN_SPEECH_MILLIS = 200; - private static final int MAX_SPEECH_MILLIS = 60_000; + private static final int MAX_SPEECH_MILLIS = 10_000; private static final int MAX_SILENCE_WINDOWS = 
MAX_SILENCE_MILLIS / WINDOW_MILLIS; private static final int MIN_SPEECH_WINDOWS = MIN_SPEECH_MILLIS / WINDOW_MILLIS; private static final int MAX_SPEECH_WINDOWS = MAX_SPEECH_MILLIS / WINDOW_MILLIS; @@ -81,7 +81,7 @@ public void run() { // minSpectralFlatness = Math.min(minSpectralFlatness, energy); double energyThreshold = ENERGY_PRIMARY_THRESHOLD * Math.log(minEnergy); - +System.out.println("energy: " + energy + "\tfrequency:" + frequency); if (energy - minEnergy >= energyThreshold) counter++; if (frequency - minFrequency >= FREQUENCY_PRIMARY_THRESHOLD) counter++; // if (sfm - minSpectralFlatness) >= SPECTRAL_FLATNESS_PRIMARY_THRESHOLD) counter++; @@ -97,7 +97,8 @@ public void run() { } if (offset + bytesRead < bufferSize) { - outBuffer.write(audioData, offset, bytesRead); +System.out.println(" offset: " + offset + " \t bytesRead: " + bytesRead); + outBuffer.write(audioData, 0, bytesRead); offset += bytesRead; if (speechCount >= MAX_SPEECH_WINDOWS) { @@ -108,7 +109,7 @@ public void run() { } else { // Reached the end of the buffer! Send what we've captured so far bytesRead = bufferSize - offset; - outBuffer.write(audioData, offset, bytesRead); + outBuffer.write(audioData, 0, bytesRead); emitVoiceActivity(outBuffer); offset = 0; } @@ -121,6 +122,8 @@ public void run() { if (silenceCount >= MAX_SILENCE_WINDOWS && speechCount >= MIN_SPEECH_WINDOWS) { // We have silence after a chunk of speech worth processing emitVoiceActivity(outBuffer); + outBuffer.reset(); +//TODO: is offset needed? 
offset = 0; } From 2c182c55bd3016bd21519992c6eed32a5994728b Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Fri, 14 Jul 2017 21:58:32 +1000 Subject: [PATCH 17/20] Implemented Skylion's SimpleVAD --- .../speech/microphone/Microphone.java | 4 - .../speech/microphone/MicrophoneAnalyzer.java | 4 +- .../speech/recognizer/vad/AbstractVAD.java | 142 ++++++++++++++++++ .../vad/MoattarHomayounpourVAD.java | 59 ++++++++ .../speech/recognizer/vad/SimpleVAD.java | 37 +++++ .../recognizer/vad/VoiceActivityDetector.java | 139 +---------------- 6 files changed, 247 insertions(+), 138 deletions(-) create mode 100644 src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java create mode 100644 src/main/java/com/darkprograms/speech/recognizer/vad/MoattarHomayounpourVAD.java create mode 100644 src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java diff --git a/src/main/java/com/darkprograms/speech/microphone/Microphone.java b/src/main/java/com/darkprograms/speech/microphone/Microphone.java index 7ed0ecd..95d5faa 100644 --- a/src/main/java/com/darkprograms/speech/microphone/Microphone.java +++ b/src/main/java/com/darkprograms/speech/microphone/Microphone.java @@ -133,15 +133,11 @@ public AudioInputStream captureAudioToStream() { initTargetDataLine(); } - //Get Audio -// new Thread(new ListenThread()).start(); open(); - this.sampleRate = sampleRate; audioStream = new AudioInputStream(getTargetDataLine()); return audioStream; } - /** * Captures audio from the microphone and saves it a file * diff --git a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java index 2519201..a064235 100644 --- a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java +++ b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java @@ -24,7 +24,7 @@ public MicrophoneAnalyzer(AudioFileFormat.Type fileType){ } /** - * Constructor for use with {@link 
#captureAudioToStream(float sampleRate)} + * Constructor for use with {@link #captureAudioToStream()} * @param sampleRate samples per second - 16_000 (recommended) or 8_000 */ public MicrophoneAnalyzer(float sampleRate){ @@ -126,7 +126,7 @@ private byte[] getBytes(int numOfBytes) { * be in error due to the complex nature of sound. This feature is in Beta * @return The frequency of the sound in Hertz. */ - public int getFrequency(){ + public int getFrequency() { try { return getFrequency(4096); } catch (Exception e) { diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java b/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java new file mode 100644 index 0000000..e0d4f34 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java @@ -0,0 +1,142 @@ +package com.darkprograms.speech.recognizer.vad; + +import com.darkprograms.speech.microphone.MicrophoneAnalyzer; + +import javax.sound.sampled.AudioInputStream; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; + + +public abstract class AbstractVAD implements VoiceActivityDetector, Runnable { + private static final int WINDOW_MILLIS = 16; + private static final int IGNORE_SILENCE_WINDOWS = 10; + private static final int IGNORE_SPEECH_WINDOWS = 5; + /** maximum ms between words */ + private static final int MAX_SILENCE_MILLIS = 4; + /** minimum duration of speech to recognise */ + private static final int MIN_SPEECH_MILLIS = 200; + private static final double WINDOW_SECONDS = (double)WINDOW_MILLIS / 1000; + /** Google does not allow recordings over 1 minute, but 10 seconds should be ample */ + private static final int MAX_SPEECH_MILLIS = 10_000; + private static final int MAX_SILENCE_WINDOWS = MAX_SILENCE_MILLIS / WINDOW_MILLIS; + private static final int MIN_SPEECH_WINDOWS = MIN_SPEECH_MILLIS / WINDOW_MILLIS; + + protected AudioInputStream audio; + protected MicrophoneAnalyzer mic; + protected VoiceActivityListener 
listener; + protected VadState state; + + protected int maxSpeechMs; + protected int maxSpeechWindows; + protected int silenceCount; + protected int speechCount; + + private int offset; + private int bufferSize; + private ByteArrayOutputStream outBuffer; + + // TODO: optionally provide PipedInputStream to support streaming recogntion on Google + public void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener listener) { + detectVoiceActivity(mic, MAX_SPEECH_MILLIS, listener); + } + + /** Initialise the VAD and start a thread */ + public void detectVoiceActivity(MicrophoneAnalyzer mic, int maxSpeechMs, VoiceActivityListener listener) { + this.listener = listener; + this.mic = mic; + this.audio = mic.captureAudioToStream(); + this.maxSpeechMs = maxSpeechMs; + maxSpeechWindows = maxSpeechMs / WINDOW_MILLIS; + new Thread(this, "JARVIS-VAD").start(); + } + + /** + * Continuously reads "windows" of audio into a buffer and delegates to {@link #sampleForSpeech(byte[])} + * and {@link #incrementSpeechCounter(boolean, int, byte[])}. + * {@link #emitVoiceActivity(ByteArrayOutputStream)} will be called when an utterance has been captured. 
+ */ + @Override + public void run() { + int bytesToRead = mic.getNumOfBytes(WINDOW_SECONDS); + byte[] audioData = new byte[bytesToRead]; + bufferSize = maxSpeechMs * this.mic.getNumOfBytes(0.001); + silenceCount = 0; + speechCount = 0; + offset = 0; + outBuffer = new ByteArrayOutputStream(bufferSize); + + state = VoiceActivityDetector.VadState.LISTENING; + + while (true) { + try { + int bytesRead = this.audio.read(audioData, 0, bytesToRead); + boolean speechDetected = sampleForSpeech(audioData); + incrementSpeechCounter(speechDetected, bytesRead, audioData); + } catch (Exception e) { + e.printStackTrace(); + state = VadState.CLOSED; + return; + } + } + } + + /** + * Executed from within the VAD thread + * @param audioData + * @return + */ + protected abstract boolean sampleForSpeech(byte[] audioData); + + protected void incrementSpeechCounter(boolean speechDetected, int bytesRead, byte[] audioData) { + if (speechDetected) { + speechCount++; + // Ignore speech runs less than 5 successive frames. + if (state != VoiceActivityDetector.VadState.DETECTED_SPEECH && speechCount >= IGNORE_SPEECH_WINDOWS) { + state = VoiceActivityDetector.VadState.DETECTED_SPEECH; + silenceCount = 0; + } + + if (offset + bytesRead < bufferSize) { + outBuffer.write(audioData, 0, bytesRead); + offset += bytesRead; + + if (speechCount >= maxSpeechWindows) { + // in theory, this should be handled by the following end of buffer handler + emitVoiceActivity(outBuffer); + } + } else { + // Reached the end of the buffer! Send what we've captured so far + bytesRead = bufferSize - offset; + outBuffer.write(audioData, 0, bytesRead); + emitVoiceActivity(outBuffer); + } + } else { + // silence + silenceCount++; + + // Ignore silence runs less than 10 successive frames. 
+ if (state == VoiceActivityDetector.VadState.DETECTED_SPEECH && silenceCount >= IGNORE_SILENCE_WINDOWS) { + if (silenceCount >= MAX_SILENCE_WINDOWS && speechCount >= MIN_SPEECH_WINDOWS) { + // We have silence after a chunk of speech worth processing + emitVoiceActivity(outBuffer); + } else { + state = VoiceActivityDetector.VadState.DETECTED_SILENCE_AFTER_SPEECH; + } + + speechCount = 0; + } + } + } + + protected void emitVoiceActivity(ByteArrayOutputStream outBuffer) { + listener.onVoiceActivity(createVoiceActivityStream(outBuffer)); + outBuffer.reset(); + offset = 0; + state = VadState.LISTENING; + } + + protected AudioInputStream createVoiceActivityStream(ByteArrayOutputStream outBuffer) { + return new AudioInputStream(new ByteArrayInputStream(outBuffer.toByteArray()), audio.getFormat(), mic.getNumOfFrames(outBuffer.size())); + } + +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/MoattarHomayounpourVAD.java b/src/main/java/com/darkprograms/speech/recognizer/vad/MoattarHomayounpourVAD.java new file mode 100644 index 0000000..1f6a3bd --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/MoattarHomayounpourVAD.java @@ -0,0 +1,59 @@ +package com.darkprograms.speech.recognizer.vad; + +/** + * Implementation of [https://www.researchgate.net/publication/255667085_A_simple_but_efficient_real-time_voice_activity_detection_algorithm] + * + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * !! WARNING - this is not working correctly !! + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+ * + * TODO: need to calculate Spectral Flatness Measure + */ +public class MoattarHomayounpourVAD extends AbstractVAD { + private static final int ENERGY_PRIMARY_THRESHOLD = 40; + private static final int FREQUENCY_PRIMARY_THRESHOLD = 185; + private static final int SPECTRAL_FLATNESS_PRIMARY_THRESHOLD = 5; + + private int minEnergy = Integer.MAX_VALUE; + private int minFrequency = Integer.MAX_VALUE; + private int minSpectralFlatness = Integer.MAX_VALUE; + + @Override + public void run() { + minEnergy = Integer.MAX_VALUE; + minFrequency = Integer.MAX_VALUE; + minSpectralFlatness = Integer.MAX_VALUE; + super.run(); + } + + @Override + protected boolean sampleForSpeech(byte[] audioData) { + int counter = 0; + int energy = mic.calculateRMSLevel(audioData); + int frequency = mic.getFrequency(audioData); + + // ignore frequencies above 400hz (and below 50Hz?) + if (frequency < 400) { + // 3-2-2- Compute the abstract value of Spectral Flatness Measure SFM(i) +// TODO https://github.com/filipeuva/SoundBites/blob/master/src/uk/co/biogen/SoundBites/analysis/AnalysisInterface.java#L264 + + // 3-3- Supposing that some of the first 30 frames are silence, find the minimum value for E, F & SF + minEnergy = Math.min(minEnergy, energy); + minFrequency = Math.min(minFrequency, frequency); +// minSpectralFlatness = Math.min(minSpectralFlatness, energy); + + double energyThreshold = ENERGY_PRIMARY_THRESHOLD * Math.log(minEnergy); + System.out.println("energy: " + energy + "\tfrequency:" + frequency); + if (energy - minEnergy >= energyThreshold) counter++; + if (frequency - minFrequency >= FREQUENCY_PRIMARY_THRESHOLD) counter++; +// if (sfm - minSpectralFlatness) >= SPECTRAL_FLATNESS_PRIMARY_THRESHOLD) counter++; + } + + if(counter > 1) { + return true; + } else { + minEnergy = ((silenceCount * minEnergy) + energy) / (silenceCount + 1); + return false; + } + } +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java 
b/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java new file mode 100644 index 0000000..39eed1d --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java @@ -0,0 +1,37 @@ +package com.darkprograms.speech.recognizer.vad; + +/** + * Adapted from https://stackoverflow.com/questions/18815235/can-i-use-google-speech-recognition-api-in-my-desktop-application + */ +public class SimpleVAD extends AbstractVAD { + private int threshold = 10; + private int ambientVolume; + private int speakingVolume; + private boolean speaking; + + public void setThreshold(int threshold) { + this.threshold = threshold; + } + + @Override + public void run() { + speakingVolume = -2; + speaking = false; + ambientVolume = mic.getAudioVolume(); + super.run(); + } + + @Override + protected boolean sampleForSpeech(byte[] audioData) { + int volume = mic.calculateRMSLevel(audioData); +System.out.println(volume); + if (volume > ambientVolume + threshold) { + speakingVolume = volume; + speaking = true; + } + if (speaking && volume + threshold < speakingVolume) { + speaking = false; + } + return speaking; + } +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java index 0420925..011d8a1 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -9,142 +9,17 @@ import java.io.IOException; /** - * Implementation of [https://www.researchgate.net/publication/255667085_A_simple_but_efficient_real-time_voice_activity_detection_algorithm] - * * @see [https://github.com/Sciss/SpeechRecognitionHMM/blob/master/src/main/java/org/ioe/tprsa/audio/preProcessings/EndPointDetection.java] */ -public class VoiceActivityDetector implements Runnable { - private static final int WINDOW_MILLIS = 8; - private static final double 
WINDOW_SECONDS = (double)WINDOW_MILLIS / 1000; - private static final int IGNORE_SILENCE_WINDOWS = 10; - private static final int IGNORE_SPEECH_WINDOWS = 5; - /** maximum ms between words */ - private static final int MAX_SILENCE_MILLIS = 4; - /** minimum duration of speech to recognise */ - private static final int MIN_SPEECH_MILLIS = 200; - private static final int MAX_SPEECH_MILLIS = 10_000; - private static final int MAX_SILENCE_WINDOWS = MAX_SILENCE_MILLIS / WINDOW_MILLIS; - private static final int MIN_SPEECH_WINDOWS = MIN_SPEECH_MILLIS / WINDOW_MILLIS; - private static final int MAX_SPEECH_WINDOWS = MAX_SPEECH_MILLIS / WINDOW_MILLIS; - private static final int ENERGY_PRIMARY_THRESHOLD = 40; - private static final int FREQUENCY_PRIMARY_THRESHOLD = 185; - private static final int SPECTRAL_FLATNESS_PRIMARY_THRESHOLD = 5; - - private AudioInputStream audio; - private MicrophoneAnalyzer mic; - private VoiceActivityListener listener; - private VadState state; - - private enum VadState { +public interface VoiceActivityDetector { + enum VadState { LISTENING, DETECTED_SPEECH, - DETECTED_SILENCE_AFTER_SPEECH + DETECTED_SILENCE_AFTER_SPEECH, + CLOSED } - // TODO: optionally provide PipedInputStream to support streaming recogntion on Google - public void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener listener) { - this.listener = listener; - this.mic = mic; - this.audio = mic.captureAudioToStream(); - new Thread(this, "JARVIS-VAD").start(); - } - - public void run() { - byte[] audioData = new byte[mic.getNumOfBytes(WINDOW_SECONDS)]; - int offset = 0; - int bufferSize = MAX_SPEECH_MILLIS * this.mic.getNumOfBytes(0.001); - int silenceCount = 0; - int speechCount = 0; - int minEnergy = Integer.MAX_VALUE; - int minFrequency = Integer.MAX_VALUE; - int minSpectralFlatness = Integer.MAX_VALUE; - ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(bufferSize); - - state = VadState.LISTENING; - - while (true) { - try { - int bytesRead = 
this.audio.read(audioData); - - int counter = 0; - int energy = mic.calculateRMSLevel(audioData); - int frequency = mic.getFrequency(audioData); - - // ignore frequencies above 400hz (and below 50Hz?) - if (frequency < 400) { - // 3-2-2- Compute the abstract value of Spectral Flatness Measure SFM(i) -// TODO https://github.com/filipeuva/SoundBites/blob/master/src/uk/co/biogen/SoundBites/analysis/AnalysisInterface.java#L264 - - // 3-3- Supposing that some of the first 30 frames are silence, find the minimum value for E, F & SF - minEnergy = Math.min(minEnergy, energy); - minFrequency = Math.min(minFrequency, frequency); -// minSpectralFlatness = Math.min(minSpectralFlatness, energy); - - double energyThreshold = ENERGY_PRIMARY_THRESHOLD * Math.log(minEnergy); -System.out.println("energy: " + energy + "\tfrequency:" + frequency); - if (energy - minEnergy >= energyThreshold) counter++; - if (frequency - minFrequency >= FREQUENCY_PRIMARY_THRESHOLD) counter++; -// if (sfm - minSpectralFlatness) >= SPECTRAL_FLATNESS_PRIMARY_THRESHOLD) counter++; - } - - if (counter >= 0) { - // speech - speechCount++; - // Ignore speech runs less than 5 successive frames. - if (state != VadState.DETECTED_SPEECH && speechCount >= IGNORE_SPEECH_WINDOWS) { - state = VadState.DETECTED_SPEECH; - silenceCount = 0; - } - - if (offset + bytesRead < bufferSize) { -System.out.println(" offset: " + offset + " \t bytesRead: " + bytesRead); - outBuffer.write(audioData, 0, bytesRead); - offset += bytesRead; - - if (speechCount >= MAX_SPEECH_WINDOWS) { - // in theory, this should be handled by the following end of buffer handler - emitVoiceActivity(outBuffer); - offset = 0; - } - } else { - // Reached the end of the buffer! 
Send what we've captured so far - bytesRead = bufferSize - offset; - outBuffer.write(audioData, 0, bytesRead); - emitVoiceActivity(outBuffer); - offset = 0; - } - } else { - // silence - silenceCount++; - minEnergy = ((silenceCount * minEnergy) + energy) / (silenceCount + 1); - // Ignore silence runs less than 10 successive frames. - if (state == VadState.DETECTED_SPEECH && silenceCount >= IGNORE_SILENCE_WINDOWS) { - if (silenceCount >= MAX_SILENCE_WINDOWS && speechCount >= MIN_SPEECH_WINDOWS) { - // We have silence after a chunk of speech worth processing - emitVoiceActivity(outBuffer); - outBuffer.reset(); -//TODO: is offset needed? - offset = 0; - } - - state = VadState.DETECTED_SILENCE_AFTER_SPEECH; - speechCount = 0; - } - } - } catch (IOException e) { - e.printStackTrace(); - return; - } - } - } - - private void emitVoiceActivity(ByteArrayOutputStream outBuffer) { - listener.onVoiceActivity(createVoiceActivityStream(outBuffer)); - outBuffer.reset(); - state = VadState.LISTENING; - } - - private AudioInputStream createVoiceActivityStream(ByteArrayOutputStream outBuffer) { - return new AudioInputStream(new ByteArrayInputStream(outBuffer.toByteArray()), audio.getFormat(), mic.getNumOfFrames(outBuffer.size())); - } + // TODO: optionally provide PipedInputStream to support streaming recognition on Google + void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener listener); + void detectVoiceActivity(MicrophoneAnalyzer mic, int maxSpeechMs, VoiceActivityListener listener); } From 715886e6e19accad7b8daf2c12fdf62dd7c50e80 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Fri, 14 Jul 2017 23:19:56 +1000 Subject: [PATCH 18/20] Added RecordingListener --- .../speech/encoding/FlacEncoder.java | 2 - .../recognizer/vad/RecordingListener.java | 41 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java diff --git 
a/src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java b/src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java index bf06c25..d3def56 100644 --- a/src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java +++ b/src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java @@ -32,8 +32,6 @@ public FlacEncoder() { * @param outputFile Output FLAC file */ public void convertWaveToFlac(File inputFile, File outputFile) { - - StreamConfiguration streamConfiguration = new StreamConfiguration(); streamConfiguration.setSampleRate(8000); streamConfiguration.setBitsPerSample(16); diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java b/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java new file mode 100644 index 0000000..68f4d33 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java @@ -0,0 +1,41 @@ +package com.darkprograms.speech.recognizer.vad; + +import javax.sound.sampled.AudioFileFormat; +import javax.sound.sampled.AudioInputStream; +import javax.sound.sampled.AudioSystem; +import java.io.File; +import java.io.IOException; +import java.util.Date; + +/** + * Useful for debugging & testing microphone + */ +public class RecordingListener implements VoiceActivityListener { + private VoiceActivityListener nextListener; + + @Override + public void onVoiceActivity(AudioInputStream audioInputStream) { + String fileName = new Date().toString() + ".wav"; + File out = new File("/temp", fileName); + + try { + System.out.println("Saving recoring to " + out.getAbsolutePath()); + AudioSystem.write(audioInputStream, AudioFileFormat.Type.WAVE, out); + } catch (IOException e) { + e.printStackTrace(); + } + + if (nextListener != null) { + nextListener.onVoiceActivity(audioInputStream); + } + } + + public RecordingListener withNextListener(VoiceActivityListener nextListener) { + this.nextListener = nextListener; + return this; + } + + public void 
setNextListener(VoiceActivityListener nextListener) { + this.nextListener = nextListener; + } +} From 657827cb636ca2df9641b88eb52aaeeb6e95927c Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Sat, 15 Jul 2017 19:29:58 +1000 Subject: [PATCH 19/20] logging & debugging --- .../speech/microphone/Microphone.java | 46 ++++++++++++++++--- .../recognizer/awslex/LexRecognizer.java | 4 +- .../speech/recognizer/vad/AbstractVAD.java | 16 +++---- .../recognizer/vad/RecordingListener.java | 2 +- .../speech/recognizer/vad/SimpleVAD.java | 2 +- 5 files changed, 52 insertions(+), 18 deletions(-) diff --git a/src/main/java/com/darkprograms/speech/microphone/Microphone.java b/src/main/java/com/darkprograms/speech/microphone/Microphone.java index 95d5faa..2748626 100644 --- a/src/main/java/com/darkprograms/speech/microphone/Microphone.java +++ b/src/main/java/com/darkprograms/speech/microphone/Microphone.java @@ -112,24 +112,26 @@ public void setTargetDataLine(TargetDataLine targetDataLine) { /** * Initializes the target data line. 
*/ - private void initTargetDataLine() { - initTargetDataLine(8_000F); + private TargetDataLine initTargetDataLine() { + return initTargetDataLine(8_000F); } - private void initTargetDataLine(float sampleRate) { + private TargetDataLine initTargetDataLine(float sampleRate) { this.sampleRate = sampleRate; DataLine.Info dataLineInfo = new DataLine.Info(TargetDataLine.class, getAudioFormat()); try { - setTargetDataLine((TargetDataLine) AudioSystem.getLine(dataLineInfo)); + TargetDataLine targetDataLine = (TargetDataLine)AudioSystem.getLine(dataLineInfo); + setTargetDataLine(targetDataLine); + return targetDataLine; } catch (LineUnavailableException e) { // TODO Auto-generated catch block e.printStackTrace(); - return; + return null; } } public AudioInputStream captureAudioToStream() { setState(CaptureState.STARTING_CAPTURE); - if(getTargetDataLine() == null){ + if(getTargetDataLine() == null) { initTargetDataLine(); } @@ -200,10 +202,40 @@ public void open(){ if(getTargetDataLine()==null){ initTargetDataLine(); } - if(!getTargetDataLine().isOpen() && !getTargetDataLine().isRunning() && !getTargetDataLine().isActive()){ + TargetDataLine targetDataLine = getTargetDataLine(); + if(!targetDataLine.isOpen() && !targetDataLine.isRunning() && !targetDataLine.isActive()) { try { setState(CaptureState.PROCESSING_AUDIO); + + try { +System.out.println("???????????????????????????????????????????????????????????"); + System.out.println("???????????????????????????????????????????????????????????");System.out.println("???????????????????????????????????????????????????????????");System.out.println("???????????????????????????????????????????????????????????"); + System.out.println("???????????????????????????????????????????????????????????"); + + + if (targetDataLine.isControlSupported(FloatControl.Type.MASTER_GAIN)) { +System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + 
System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + FloatControl gainControl = (FloatControl) getTargetDataLine().getControl(FloatControl.Type.MASTER_GAIN); + gainControl.setValue(40); + } + } catch (Exception e) { + try { + FloatControl gainControl = (FloatControl) getTargetDataLine().getControl(FloatControl.Type.VOLUME); + gainControl.setValue(-10); + } catch (Exception e1) { + e1.printStackTrace(); + } + } getTargetDataLine().open(getAudioFormat()); + getTargetDataLine().start(); } catch (LineUnavailableException e) { // TODO Auto-generated catch block diff --git a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java index 063f3d8..dd2fc14 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java +++ b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java @@ -7,6 +7,7 @@ import org.json.JSONObject; import javax.sound.sampled.AudioInputStream; +import java.io.IOException; import java.util.Base64; import java.util.Map; @@ -138,7 +139,8 @@ public LexResponse 
getRecognizedDataForStream(AudioInputStream stream, String se .withContentType("audio/l16; rate=16000; channels=1") .withSessionAttributes(sessionAttributes); - System.out.println("sending request to Lex: " + request); +// System.out.println("sending request to Lex: " + request); +// try {System.out.println(">> " + stream.available());} catch (IOException e) {} PostContentResult result = lex.postContent(request); return new LexResponse(result); diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java b/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java index e0d4f34..fe8feb4 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java @@ -22,14 +22,14 @@ public abstract class AbstractVAD implements VoiceActivityDetector, Runnable { private static final int MIN_SPEECH_WINDOWS = MIN_SPEECH_MILLIS / WINDOW_MILLIS; protected AudioInputStream audio; - protected MicrophoneAnalyzer mic; - protected VoiceActivityListener listener; - protected VadState state; - - protected int maxSpeechMs; - protected int maxSpeechWindows; - protected int silenceCount; - protected int speechCount; + MicrophoneAnalyzer mic; + private VoiceActivityListener listener; + private VadState state; + + private int maxSpeechMs; + private int maxSpeechWindows; + int silenceCount; + private int speechCount; private int offset; private int bufferSize; diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java b/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java index 68f4d33..5eccaf5 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java @@ -16,7 +16,7 @@ public class RecordingListener implements VoiceActivityListener { @Override public void onVoiceActivity(AudioInputStream audioInputStream) { 
String fileName = new Date().toString() + ".wav"; - File out = new File("/temp", fileName); + File out = new File("/tmp", fileName); try { System.out.println("Saving recoring to " + out.getAbsolutePath()); diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java b/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java index 39eed1d..299f53a 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java @@ -24,7 +24,7 @@ public void run() { @Override protected boolean sampleForSpeech(byte[] audioData) { int volume = mic.calculateRMSLevel(audioData); -System.out.println(volume); +//System.out.println(volume); if (volume > ambientVolume + threshold) { speakingVolume = volume; speaking = true; From 4fbf08d3452c8eaa73f76fbd3d41f6d982ab5fa1 Mon Sep 17 00:00:00 2001 From: Nicholas Albion Date: Tue, 18 Jul 2017 00:32:21 +1000 Subject: [PATCH 20/20] extended interface for Idear --- .../speech/recognizer/vad/AbstractVAD.java | 51 ++++++++++++++++--- .../recognizer/vad/VoiceActivityDetector.java | 4 ++ 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java b/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java index fe8feb4..f2ddb2a 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java @@ -1,5 +1,6 @@ package com.darkprograms.speech.recognizer.vad; +import com.darkprograms.speech.microphone.Microphone; import com.darkprograms.speech.microphone.MicrophoneAnalyzer; import javax.sound.sampled.AudioInputStream; @@ -25,6 +26,7 @@ public abstract class AbstractVAD implements VoiceActivityDetector, Runnable { MicrophoneAnalyzer mic; private VoiceActivityListener listener; private VadState state; + private Thread thread; private int maxSpeechMs; private int 
maxSpeechWindows; @@ -41,13 +43,46 @@ public void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener li } /** Initialise the VAD and start a thread */ + @Override public void detectVoiceActivity(MicrophoneAnalyzer mic, int maxSpeechMs, VoiceActivityListener listener) { this.listener = listener; - this.mic = mic; - this.audio = mic.captureAudioToStream(); this.maxSpeechMs = maxSpeechMs; maxSpeechWindows = maxSpeechMs / WINDOW_MILLIS; - new Thread(this, "JARVIS-VAD").start(); + + if (this.mic != null) { + if (this.mic == mic) { + // re-open the same mic + if (mic.getState() == Microphone.CaptureState.CLOSED) { + mic.open(); + } + return; + } else { + // swap mics + this.audio = mic.captureAudioToStream(); + this.mic.close(); + } + } else { + this.audio = mic.captureAudioToStream(); + } + + this.mic = mic; + } + + @Override + public void setVoiceActivityListener(VoiceActivityListener listener) { + this.listener = listener; + } + + @Override + public void start() { + thread = new Thread(this, "JARVIS-VAD"); + thread.start(); + } + + @Override + public void terminate() { +// state = VadState.CLOSED; + thread.interrupt(); } /** @@ -67,7 +102,7 @@ public void run() { state = VoiceActivityDetector.VadState.LISTENING; - while (true) { + while (state != VadState.CLOSED) { try { int bytesRead = this.audio.read(audioData, 0, bytesToRead); boolean speechDetected = sampleForSpeech(audioData); @@ -101,11 +136,11 @@ protected void incrementSpeechCounter(boolean speechDetected, int bytesRead, byt offset += bytesRead; if (speechCount >= maxSpeechWindows) { - // in theory, this should be handled by the following end of buffer handler + System.out.println("in theory, this should be handled by the following end of buffer handler"); emitVoiceActivity(outBuffer); } } else { - // Reached the end of the buffer! Send what we've captured so far + System.out.println("Reached the end of the buffer! 
Send what we've captured so far"); bytesRead = bufferSize - offset; outBuffer.write(audioData, 0, bytesRead); emitVoiceActivity(outBuffer); @@ -117,7 +152,7 @@ protected void incrementSpeechCounter(boolean speechDetected, int bytesRead, byt // Ignore silence runs less than 10 successive frames. if (state == VoiceActivityDetector.VadState.DETECTED_SPEECH && silenceCount >= IGNORE_SILENCE_WINDOWS) { if (silenceCount >= MAX_SILENCE_WINDOWS && speechCount >= MIN_SPEECH_WINDOWS) { - // We have silence after a chunk of speech worth processing + System.out.println("We have silence after a chunk of speech worth processing"); emitVoiceActivity(outBuffer); } else { state = VoiceActivityDetector.VadState.DETECTED_SILENCE_AFTER_SPEECH; @@ -136,7 +171,7 @@ protected void emitVoiceActivity(ByteArrayOutputStream outBuffer) { } protected AudioInputStream createVoiceActivityStream(ByteArrayOutputStream outBuffer) { + System.out.println("speech: " + mic.getAudioFormat().getFrameSize() * mic.getNumOfFrames(outBuffer.size())); return new AudioInputStream(new ByteArrayInputStream(outBuffer.toByteArray()), audio.getFormat(), mic.getNumOfFrames(outBuffer.size())); } - } diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java index 011d8a1..6496307 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java +++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java @@ -19,7 +19,11 @@ enum VadState { CLOSED } + void start(); + void terminate(); + // TODO: optionally provide PipedInputStream to support streaming recognition on Google void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener listener); void detectVoiceActivity(MicrophoneAnalyzer mic, int maxSpeechMs, VoiceActivityListener listener); + void setVoiceActivityListener(VoiceActivityListener listener); }