diff --git a/README.markdown b/README.markdown index e6bfcf7..9c48174 100644 --- a/README.markdown +++ b/README.markdown @@ -11,6 +11,7 @@ The J.A.R.V.I.S. Speech API is designed to be simple and efficient, using the sp The API currently provides the following functionality, * Microphone Capture API (Wrapped around the current Java API for simplicity) + * Voice Activity Detector * A speech recognizer using Google's recognizer service * Converts WAVE files from microphone input to FLAC (using existing API, see CREDITS) * Retrieves Response from Google, including confidence score and text diff --git a/pom.xml b/pom.xml index 0c80b1c..d750e70 100644 --- a/pom.xml +++ b/pom.xml @@ -4,11 +4,13 @@ com.darkprograms.speech java-speech-api - 1.13.0-SNAPSHOT + 2.0.0-SNAPSHOT jar UTF-8 + 1.8 + 1.8 java-speech-api @@ -69,6 +71,10 @@ https://github.com/AranHase AranHase + + https://github.com/nalbion + nalbion + @@ -82,6 +88,11 @@ json 20150729 + + com.amazonaws + aws-java-sdk-lex + 1.11.160 + diff --git a/src/main/java/com/darkprograms/speech/recognizer/FlacEncoder.java b/src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java similarity index 98% rename from src/main/java/com/darkprograms/speech/recognizer/FlacEncoder.java rename to src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java index 180ab9a..d3def56 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/FlacEncoder.java +++ b/src/main/java/com/darkprograms/speech/encoding/FlacEncoder.java @@ -1,4 +1,4 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.encoding; import net.sourceforge.javaflacencoder.FLACEncoder; import net.sourceforge.javaflacencoder.FLACFileOutputStream; @@ -32,8 +32,6 @@ public FlacEncoder() { * @param outputFile Output FLAC file */ public void convertWaveToFlac(File inputFile, File outputFile) { - - StreamConfiguration streamConfiguration = new StreamConfiguration(); streamConfiguration.setSampleRate(8000); 
streamConfiguration.setBitsPerSample(16); diff --git a/src/main/java/com/darkprograms/speech/microphone/Microphone.java b/src/main/java/com/darkprograms/speech/microphone/Microphone.java index 7859050..2748626 100644 --- a/src/main/java/com/darkprograms/speech/microphone/Microphone.java +++ b/src/main/java/com/darkprograms/speech/microphone/Microphone.java @@ -40,6 +40,9 @@ public enum CaptureState { */ private File audioFile; + private AudioInputStream audioStream; + private float sampleRate; + /** * Constructor * @@ -52,6 +55,15 @@ public Microphone(AudioFileFormat.Type fileType) { initTargetDataLine(); } + /** + * Constructor for use with {@link #captureAudioToStream()} + * @param sampleRate samples per second - 16_000 (recommended) or 8_000 + */ + public Microphone(float sampleRate) { + setState(CaptureState.CLOSED); + initTargetDataLine(sampleRate); + } + /** * Gets the current state of Microphone * @@ -100,18 +112,33 @@ public void setTargetDataLine(TargetDataLine targetDataLine) { /** * Initializes the target data line. 
*/ - private void initTargetDataLine(){ + private TargetDataLine initTargetDataLine() { + return initTargetDataLine(8_000F); + } + private TargetDataLine initTargetDataLine(float sampleRate) { + this.sampleRate = sampleRate; DataLine.Info dataLineInfo = new DataLine.Info(TargetDataLine.class, getAudioFormat()); try { - setTargetDataLine((TargetDataLine) AudioSystem.getLine(dataLineInfo)); + TargetDataLine targetDataLine = (TargetDataLine)AudioSystem.getLine(dataLineInfo); + setTargetDataLine(targetDataLine); + return targetDataLine; } catch (LineUnavailableException e) { // TODO Auto-generated catch block e.printStackTrace(); - return; + return null; } - } + public AudioInputStream captureAudioToStream() { + setState(CaptureState.STARTING_CAPTURE); + if(getTargetDataLine() == null) { + initTargetDataLine(); + } + + open(); + audioStream = new AudioInputStream(getTargetDataLine()); + return audioStream; + } /** * Captures audio from the microphone and saves it a file @@ -129,8 +156,6 @@ public void captureAudioToFile(File audioFile) throws LineUnavailableException { //Get Audio new Thread(new CaptureThread()).start(); - - } /** @@ -144,14 +169,19 @@ public void captureAudioToFile(String audioFile) throws LineUnavailableException captureAudioToFile(file); } - /** * The audio format to save in * * @return Returns AudioFormat to be used later when capturing audio from microphone */ public AudioFormat getAudioFormat() { - float sampleRate = 8000.0F; + return getAudioFormat(sampleRate); + } + + /** + * @param sampleRate set to 16_000.0F for AWS Lex + */ + public AudioFormat getAudioFormat(float sampleRate) { //8000,11025,16000,22050,44100 int sampleSizeInBits = 16; //8,16 @@ -172,10 +202,40 @@ public void open(){ if(getTargetDataLine()==null){ initTargetDataLine(); } - if(!getTargetDataLine().isOpen() && !getTargetDataLine().isRunning() && !getTargetDataLine().isActive()){ + TargetDataLine targetDataLine = getTargetDataLine(); + if(!targetDataLine.isOpen() && 
!targetDataLine.isRunning() && !targetDataLine.isActive()) { try { setState(CaptureState.PROCESSING_AUDIO); + + try { +System.out.println("???????????????????????????????????????????????????????????"); + System.out.println("???????????????????????????????????????????????????????????");System.out.println("???????????????????????????????????????????????????????????");System.out.println("???????????????????????????????????????????????????????????"); + System.out.println("???????????????????????????????????????????????????????????"); + + + if (targetDataLine.isControlSupported(FloatControl.Type.MASTER_GAIN)) { +System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Setting gain!!!!!!!!!!!!!!"); + FloatControl gainControl = (FloatControl) getTargetDataLine().getControl(FloatControl.Type.MASTER_GAIN); + gainControl.setValue(40); + } + } catch (Exception e) { + try { + FloatControl gainControl = (FloatControl) getTargetDataLine().getControl(FloatControl.Type.VOLUME); + gainControl.setValue(-10); + } catch (Exception e1) { + 
e1.printStackTrace(); + } + } getTargetDataLine().open(getAudioFormat()); + getTargetDataLine().start(); } catch (LineUnavailableException e) { // TODO Auto-generated catch block @@ -183,7 +243,6 @@ public void open(){ return; } } - } /** @@ -203,7 +262,6 @@ public void close() { * Thread to capture the audio from the microphone and save it to a file */ private class CaptureThread implements Runnable { - /** * Run method for thread */ @@ -220,4 +278,14 @@ public void run() { } } + /*private class ListenThread implements Runnable { + public void run() { + try { + open(); + audioStream = new AudioInputStream(getTargetDataLine()); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + }*/ } diff --git a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java index e757a89..a064235 100644 --- a/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java +++ b/src/main/java/com/darkprograms/speech/microphone/MicrophoneAnalyzer.java @@ -1,6 +1,8 @@ package com.darkprograms.speech.microphone; import javax.sound.sampled.AudioFileFormat; +import javax.sound.sampled.AudioFormat; + import com.darkprograms.speech.util.*; /******************************************************************************************** @@ -20,6 +22,14 @@ public class MicrophoneAnalyzer extends Microphone { public MicrophoneAnalyzer(AudioFileFormat.Type fileType){ super(fileType); } + + /** + * Constructor for use with {@link #captureAudioToStream()} + * @param sampleRate samples per second - 16_000 (recommended) or 8_000 + */ + public MicrophoneAnalyzer(float sampleRate){ + super(sampleRate); + } /** * Gets the volume of the microphone input @@ -85,8 +95,14 @@ public int getNumOfBytes(int seconds){ * @param seconds The length in seconds * @return the number of bytes the microphone will output over the specified time. 
*/ - public int getNumOfBytes(double seconds){ - return (int)(seconds*getAudioFormat().getSampleRate()*getAudioFormat().getFrameSize()+.5); + public int getNumOfBytes(double seconds) { + AudioFormat format = getAudioFormat(); + return (int)(seconds * format.getSampleRate() * format.getFrameSize() + .5); + } + + public int getNumOfFrames(int bytes) { + AudioFormat format = getAudioFormat(); + return bytes / format.getFrameSize(); } /** @@ -94,8 +110,8 @@ public int getNumOfBytes(double seconds){ * @param numOfBytes The length of the returned array. * @return The specified array or null if it cannot. */ - private byte[] getBytes(int numOfBytes){ - if(getTargetDataLine()!=null){ + private byte[] getBytes(int numOfBytes) { + if (getTargetDataLine()!=null) { byte[] data = new byte[numOfBytes]; this.getTargetDataLine().read(data, 0, numOfBytes); return data; @@ -110,7 +126,7 @@ private byte[] getBytes(int numOfBytes){ * be in error due to the complex nature of sound. This feature is in Beta * @return The frequency of the sound in Hertz. */ - public int getFrequency(){ + public int getFrequency() { try { return getFrequency(4096); } catch (Exception e) { @@ -149,12 +165,82 @@ public int getFrequency(byte[] bytes){ Complex[] fftTransformed = FFT.fft(complex); return this.calculateFundamentalFrequency(fftTransformed, 4); } + +/* *//** + * borrowed from http://www.programcreek.com/java-api-examples/index.php?source_dir=Audio-Descriptors-master/src/audio/descriptors/AudioDescriptor.java + * + * Spectral flatness provides a way to quantify how tone-like a sound is, as opposed to + * being noise-like. The meaning of tonal in this context is in the sense of the amount of peaks + * or resonant structure in a power spectrum, as opposed to flat spectrum of a white noise. 
+ * + * @param bytes + * @return Spectral Flatness coefficient + *//* + public int calculateSpectralFlatness(byte[] bytes) { + // compute FFT + FFT f = new FFT(x.bufferSize(), x.sampleRate()); + f.window(FFT.HAMMING); + f.forward(x.right); + + float num = 1; + float den = 0; + float Si = 0; + float asf = 0; // result + + final int n = -8; + final int B = 24; // number of bands + final float loF = (float) (Math.pow(2, n/4.0) * 1000); // lowest frequency [Hz] + final float hiF = (float) (Math.pow(2, B/4.0) * loF); // highest frequency [Hz] + final int loK = f.freqToIndex(loF); + final int hiK = f.freqToIndex(hiF); + final float reduceFactor = hiK - loK + 1; + + for(int k=loK; kmax){ @@ -268,8 +354,9 @@ private double[] bytesToDoubleArray(byte[] bufferData){ final int bytesRecorded = bufferData.length; final int bytesPerSample = getAudioFormat().getSampleSizeInBits()/8; final double amplification = 100.0; // choose a number as you like - double[] micBufferData = new double[bytesRecorded - bytesPerSample +1]; - for (int index = 0, floatIndex = 0; index < bytesRecorded - bytesPerSample + 1; index += bytesPerSample, floatIndex++) { + int micBufferLength = bytesRecorded; // bytesRecorded - bytesPerSample +1 + double[] micBufferData = new double[micBufferLength]; + for (int index = 0, floatIndex = 0; index < micBufferLength; index += bytesPerSample, floatIndex++) { double sample = 0; for (int b = 0; b < bytesPerSample; b++) { int v = bufferData[index + b]; @@ -284,5 +371,4 @@ private double[] bytesToDoubleArray(byte[] bufferData){ } return micBufferData; } - } diff --git a/src/main/java/com/darkprograms/speech/recognizer/RecognitionResult.java b/src/main/java/com/darkprograms/speech/recognizer/RecognitionResult.java new file mode 100644 index 0000000..134aaae --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/RecognitionResult.java @@ -0,0 +1,7 @@ +package com.darkprograms.speech.recognizer; + +public interface RecognitionResult { + /** @return String 
representation of what was said */ + String getResponse(); + boolean isFinalResponse(); +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/SpeechRecognizer.java b/src/main/java/com/darkprograms/speech/recognizer/SpeechRecognizer.java new file mode 100644 index 0000000..67483f3 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/SpeechRecognizer.java @@ -0,0 +1,4 @@ +package com.darkprograms.speech.recognizer; + +public interface SpeechRecognizer { +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java new file mode 100644 index 0000000..dd2fc14 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexRecognizer.java @@ -0,0 +1,148 @@ +package com.darkprograms.speech.recognizer.awslex; + +import com.amazonaws.services.lexruntime.AmazonLexRuntime; +import com.amazonaws.services.lexruntime.model.PostContentRequest; +import com.amazonaws.services.lexruntime.model.PostContentResult; +import com.darkprograms.speech.recognizer.RecognitionResult; +import org.json.JSONObject; + +import javax.sound.sampled.AudioInputStream; +import java.io.IOException; +import java.util.Base64; +import java.util.Map; + +/** + * example: + *
+ LexRecognizer lex = new LexRecognizer(AmazonLexRuntimeClientBuilder.defaultClient(), "MyLexBot", "PROD", "auser");
+ MicrophoneAnalyzer mic = new MicrophoneAnalyzer(16_000F);
+ VoiceActivityDetector vad = new VoiceActivityDetector();
+
+ vad.detectVoiceActivity(mic, audioInputStream -> {
+    PostContentResult result = lex.getRecognizedDataForStream(audioInputStream, myApp.getSessionAttributes()).getResult();
+    System.out.println(result.getMessage());
+ });
+ * 
+ */ +public class LexRecognizer { + private AmazonLexRuntime lex; + private String botName; + private String botAlias; + private String userId; + + public LexRecognizer(AmazonLexRuntime lex, String botName, String botAlias, String userId) { + this.lex = lex; + this.botName = botName; + this.botAlias = botAlias; + this.userId = userId; + } + + public void setUserId(String userId) { + this.userId = userId; + } + + public LexResponse getRecognizedDataForStream(AudioInputStream stream) { + return getRecognizedDataForStream(stream, (String)null); + } + + /** + * @see #getRecognizedDataForStreamWithObjects(AudioInputStream, Map) + * @param stream + * @param sessionAttributes simple key:value attributes + * @return + */ + public RecognitionResult getRecognizedDataForStream(AudioInputStream stream, Map sessionAttributes) { + String json; + if (sessionAttributes == null || sessionAttributes.isEmpty()) { + json = null; + } else { + StringBuilder str = null; + + for (Map.Entry entry : sessionAttributes.entrySet()) { + String key = entry.getKey(); + String value = entry.getValue(); + + if (str == null) { + str = new StringBuilder("{"); + } else { + str.append(","); + } + + str.append("\"").append(key).append("\":"); + if (value == null) { + str.append("null"); + } else { + str.append(JSONObject.quote(value)); + } + } + + json = str.append("}").toString(); + } + + return getRecognizedDataForStream(stream, new String(Base64.getEncoder().encode(json.getBytes()))); + } + + /** + * Each value of sesssionAttributes will be converted to a String containing JSON + * @param stream + * @param sessionAttributes + * @return + */ + public RecognitionResult getRecognizedDataForStreamWithObjects(AudioInputStream stream, Map sessionAttributes) { + String json; + if (sessionAttributes == null || sessionAttributes.isEmpty()) { + json = null; + } else { + StringBuilder str = null; + + for (Map.Entry entry : sessionAttributes.entrySet()) { + String key = entry.getKey(); + Object value = 
entry.getValue(); + + if (str == null) { + str = new StringBuilder("{"); + } else { + str.append(","); + } + + str.append("\"").append(key).append("\":"); +// if (value == null) { +// str.append("null"); +// } else if (value instanceof Number) { +// str.append(JSONObject.numberToString((Number)value)); +// } else if (value instanceof Boolean) { +// str.append(((Boolean)value).toString()); +// } else if (value instanceof String) { +// str.append("\"").append((String)value).append("\""); +// } else { + str.append(JSONObject.valueToString(value)); +// } + } + + json = str.append("}").toString(); + } + + return getRecognizedDataForStream(stream, new String(Base64.getEncoder().encode(json.getBytes()))); + } + + /** + * @param stream + * @param sessionAttributes The value must be map (keys and values must be strings) that is JSON serialized and then base64 encoded + * @return + */ + public LexResponse getRecognizedDataForStream(AudioInputStream stream, String sessionAttributes) { + PostContentRequest request = new PostContentRequest() + .withBotName(botName) + .withBotAlias(botAlias) + .withUserId(userId) + .withInputStream(stream) + .withContentType("audio/l16; rate=16000; channels=1") + .withSessionAttributes(sessionAttributes); + +// System.out.println("sending request to Lex: " + request); +// try {System.out.println(">> " + stream.available());} catch (IOException e) {} + + PostContentResult result = lex.postContent(request); + return new LexResponse(result); + } +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/awslex/LexResponse.java b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexResponse.java new file mode 100644 index 0000000..f987095 --- /dev/null +++ b/src/main/java/com/darkprograms/speech/recognizer/awslex/LexResponse.java @@ -0,0 +1,38 @@ +package com.darkprograms.speech.recognizer.awslex; + +import com.amazonaws.services.lexruntime.model.PostContentResult; +import com.darkprograms.speech.recognizer.RecognitionResult; + +public 
class LexResponse implements RecognitionResult { + private PostContentResult result; +// private String response; + + public LexResponse(PostContentResult result) { + this.result = result; +// this.response = result.getInputTranscript(); + + // Close - Fulfilled or Failed (ReadyForFulfillment?) + // Incomplete - ElicitIntent, ConfirmIntent, ElicitSlot +// result.getDialogState(); +// result.getIntentName(); +// result.getMessage(); +// result.getSessionAttributes(); +// result.getSlots(); +// result.getSlotToElicit(); +// No card?!!! + } + + public String getResponse() { + return result.getInputTranscript(); +// return null; + } + + public boolean isFinalResponse() { + String state = result.getDialogState(); + return "Fulfilled".equals(state) || "Failed".equals(state); + } + + public PostContentResult getResult() { + return result; + } +} diff --git a/src/main/java/com/darkprograms/speech/recognizer/GSpeechDuplex.java b/src/main/java/com/darkprograms/speech/recognizer/google/GSpeechDuplex.java similarity index 99% rename from src/main/java/com/darkprograms/speech/recognizer/GSpeechDuplex.java rename to src/main/java/com/darkprograms/speech/recognizer/google/GSpeechDuplex.java index a66e844..ed4421d 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/GSpeechDuplex.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/GSpeechDuplex.java @@ -1,4 +1,4 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google; import java.io.File; import java.io.IOException; @@ -184,7 +184,7 @@ public void recognize(TargetDataLine tl, AudioFormat af) throws IOException, Lin /** * This code opens a new Thread that connects to the downstream URL. Due to threading, * the best way to handle this is through the use of listeners. - * @param The URL you want to connect to. + * @param urlStr The URL you want to connect to. 
*/ private Thread downChannel(String urlStr) { final String url = urlStr; diff --git a/src/main/java/com/darkprograms/speech/recognizer/GSpeechResponseListener.java b/src/main/java/com/darkprograms/speech/recognizer/google/GSpeechResponseListener.java similarity index 75% rename from src/main/java/com/darkprograms/speech/recognizer/GSpeechResponseListener.java rename to src/main/java/com/darkprograms/speech/recognizer/google/GSpeechResponseListener.java index dcbbf2a..aca0228 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/GSpeechResponseListener.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/GSpeechResponseListener.java @@ -1,4 +1,5 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google + ; /** * Response listeners for URL connections. diff --git a/src/main/java/com/darkprograms/speech/recognizer/GoogleResponse.java b/src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java similarity index 92% rename from src/main/java/com/darkprograms/speech/recognizer/GoogleResponse.java rename to src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java index 73a86f4..666dadc 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/GoogleResponse.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/GoogleResponse.java @@ -1,4 +1,6 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google; + +import com.darkprograms.speech.recognizer.RecognitionResult; import java.util.ArrayList; import java.util.List; @@ -8,7 +10,7 @@ * * @author Luke Kuza, Duncan Jauncey, Aaron Gokaslan ******************************************************************************/ -public class GoogleResponse { +public class GoogleResponse implements RecognitionResult { /** * Variable that holds the response diff --git a/src/main/java/com/darkprograms/speech/recognizer/Recognizer.java 
b/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java similarity index 98% rename from src/main/java/com/darkprograms/speech/recognizer/Recognizer.java rename to src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java index cac98a3..431b724 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/Recognizer.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/Recognizer.java @@ -1,4 +1,4 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google; import java.util.*; import java.io.*; @@ -6,14 +6,17 @@ import java.net.URLConnection; import java.nio.charset.Charset; +import com.darkprograms.speech.recognizer.SpeechRecognizer; import org.json.*; +import com.darkprograms.speech.encoding.FlacEncoder; + /*************************************************************** * Class that submits FLAC audio and retrieves recognized text * * @author Luke Kuza, Duncan Jauncey, Aaron Gokaslan **************************************************************/ -public class Recognizer { +public class Recognizer implements SpeechRecognizer { public enum Languages{ AUTO_DETECT("auto"),//tells Google to auto-detect the language diff --git a/src/main/java/com/darkprograms/speech/recognizer/RecognizerChunked.java b/src/main/java/com/darkprograms/speech/recognizer/google/RecognizerChunked.java similarity index 99% rename from src/main/java/com/darkprograms/speech/recognizer/RecognizerChunked.java rename to src/main/java/com/darkprograms/speech/recognizer/google/RecognizerChunked.java index 160b395..62614df 100644 --- a/src/main/java/com/darkprograms/speech/recognizer/RecognizerChunked.java +++ b/src/main/java/com/darkprograms/speech/recognizer/google/RecognizerChunked.java @@ -1,4 +1,4 @@ -package com.darkprograms.speech.recognizer; +package com.darkprograms.speech.recognizer.google; import java.io.BufferedReader; diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java 
b/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java
new file mode 100644
index 0000000..f2ddb2a
--- /dev/null
+++ b/src/main/java/com/darkprograms/speech/recognizer/vad/AbstractVAD.java
@@ -0,0 +1,236 @@
+package com.darkprograms.speech.recognizer.vad;
+
+import com.darkprograms.speech.microphone.Microphone;
+import com.darkprograms.speech.microphone.MicrophoneAnalyzer;
+
+import javax.sound.sampled.AudioInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+
+/**
+ * Base class for voice activity detectors (VAD).
+ *
+ * A worker thread reads fixed-size windows of audio from the microphone stream, asks the subclass
+ * (via {@link #sampleForSpeech(byte[])}) whether each window contains speech, buffers the windows
+ * classified as speech and passes the buffered audio to the {@link VoiceActivityListener} once an
+ * utterance has been captured or the buffer is full.
+ */
+public abstract class AbstractVAD implements VoiceActivityDetector, Runnable {
+    /** duration of one analysis window in ms */
+    private static final int WINDOW_MILLIS = 16;
+    /** silence windows counted after speech before the utterance is considered finished */
+    private static final int IGNORE_SILENCE_WINDOWS = 10;
+    /** speech windows counted before the state switches to DETECTED_SPEECH */
+    private static final int IGNORE_SPEECH_WINDOWS = 5;
+    /**
+     * maximum ms between words.
+     * NOTE(review): 4 / WINDOW_MILLIS truncates to 0, so MAX_SILENCE_WINDOWS never restricts anything
+     * and IGNORE_SILENCE_WINDOWS is the effective limit - confirm the intended value.
+     */
+    private static final int MAX_SILENCE_MILLIS = 4;
+    /** minimum duration of speech to recognise */
+    private static final int MIN_SPEECH_MILLIS = 200;
+    private static final double WINDOW_SECONDS = (double) WINDOW_MILLIS / 1000;
+    /** Google does not allow recordings over 1 minute, but 10 seconds should be ample */
+    private static final int MAX_SPEECH_MILLIS = 10_000;
+    private static final int MAX_SILENCE_WINDOWS = MAX_SILENCE_MILLIS / WINDOW_MILLIS;
+    private static final int MIN_SPEECH_WINDOWS = MIN_SPEECH_MILLIS / WINDOW_MILLIS;
+
+    protected AudioInputStream audio;
+    MicrophoneAnalyzer mic;
+    // written by the caller's thread, read by the VAD thread
+    private volatile VoiceActivityListener listener;
+    private VadState state;
+    private Thread thread;
+    /** set by {@link #terminate()} to make the VAD thread leave its read loop */
+    private volatile boolean terminated;
+
+    private int maxSpeechMs;
+    private int maxSpeechWindows;
+    int silenceCount;
+    private int speechCount;
+
+    private int offset;
+    private int bufferSize;
+    private ByteArrayOutputStream outBuffer;
+
+    // TODO: optionally provide PipedInputStream to support streaming recognition on Google
+    /** Same as the three-argument overload, using MAX_SPEECH_MILLIS as the utterance limit. */
+    @Override
+    public void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener listener) {
+        detectVoiceActivity(mic, MAX_SPEECH_MILLIS, listener);
+    }
+
+    /**
+     * Initialises the VAD; the detection thread itself is started by {@link #start()}.
+     *
+     * @param mic         microphone to capture from; if it is the microphone already in use it is only
+     *                    re-opened when closed, otherwise a new capture stream is requested from it
+     * @param maxSpeechMs maximum length of one emitted utterance in ms
+     * @param listener    receives every captured utterance
+     */
+    @Override
+    public void detectVoiceActivity(MicrophoneAnalyzer mic, int maxSpeechMs, VoiceActivityListener listener) {
+        this.listener = listener;
+        this.maxSpeechMs = maxSpeechMs;
+        maxSpeechWindows = maxSpeechMs / WINDOW_MILLIS;
+
+        if (this.mic != null) {
+            if (this.mic == mic) {
+                // re-open the same mic
+                if (mic.getState() == Microphone.CaptureState.CLOSED) {
+                    mic.open();
+                }
+                return;
+            } else {
+                // swap mics
+                this.audio = mic.captureAudioToStream();
+                this.mic.close();
+            }
+        } else {
+            this.audio = mic.captureAudioToStream();
+        }
+
+        this.mic = mic;
+    }
+
+    @Override
+    public void setVoiceActivityListener(VoiceActivityListener listener) {
+        this.listener = listener;
+    }
+
+    /** Starts the detection thread; call after {@code detectVoiceActivity(...)}. */
+    @Override
+    public void start() {
+        terminated = false;
+        thread = new Thread(this, "JARVIS-VAD");
+        thread.start();
+    }
+
+    /** Asks the detection thread to stop; safe to call when {@link #start()} was never called. */
+    @Override
+    public void terminate() {
+        // interrupt() alone does not end the loop in run(), so raise the flag the loop checks
+        terminated = true;
+        Thread worker = thread;
+        if (worker != null) {
+            worker.interrupt();
+        }
+    }
+
+    /**
+     * Continuously reads "windows" of audio into a buffer and delegates to {@link #sampleForSpeech(byte[])}
+     * and {@link #incrementSpeechCounter(boolean, int, byte[])}.
+     * {@link #emitVoiceActivity(ByteArrayOutputStream)} will be called when an utterance has been captured.
+     * The loop ends when {@link #terminate()} is called, the audio stream ends or an exception is thrown.
+     */
+    @Override
+    public void run() {
+        int bytesToRead = mic.getNumOfBytes(WINDOW_SECONDS);
+        byte[] audioData = new byte[bytesToRead];
+        bufferSize = maxSpeechMs * this.mic.getNumOfBytes(0.001);
+        silenceCount = 0;
+        speechCount = 0;
+        offset = 0;
+        outBuffer = new ByteArrayOutputStream(bufferSize);
+
+        state = VoiceActivityDetector.VadState.LISTENING;
+
+        while (!terminated && state != VadState.CLOSED) {
+            try {
+                int bytesRead = this.audio.read(audioData, 0, bytesToRead);
+                if (bytesRead < 0) {
+                    // end of stream: a negative count must not reach outBuffer.write()
+                    break;
+                }
+                boolean speechDetected = sampleForSpeech(audioData);
+                incrementSpeechCounter(speechDetected, bytesRead, audioData);
+            } catch (Exception e) {
+                e.printStackTrace();
+                state = VadState.CLOSED;
+                return;
+            }
+        }
+        state = VadState.CLOSED;
+    }
+
+    /**
+     * Executed from within the VAD thread
+     * @param audioData one window of raw audio as read from the microphone stream
+     * @return true if the window is classified as speech
+     */
+    protected abstract boolean sampleForSpeech(byte[] audioData);
+
+    /**
+     * Updates the speech/silence counters and the state for one window, appends speech windows to the
+     * buffer and emits the buffer when an utterance is complete, too long, or the buffer is full.
+     *
+     * @param speechDetected result of {@link #sampleForSpeech(byte[])} for this window
+     * @param bytesRead      number of valid bytes in {@code audioData}
+     * @param audioData      the window that was analysed
+     */
+    protected void incrementSpeechCounter(boolean speechDetected, int bytesRead, byte[] audioData) {
+        if (speechDetected) {
+            speechCount++;
+            // Ignore speech runs less than 5 successive frames.
+            if (state != VoiceActivityDetector.VadState.DETECTED_SPEECH && speechCount >= IGNORE_SPEECH_WINDOWS) {
+                state = VoiceActivityDetector.VadState.DETECTED_SPEECH;
+                silenceCount = 0;
+            }
+
+            if (offset + bytesRead < bufferSize) {
+                outBuffer.write(audioData, 0, bytesRead);
+                offset += bytesRead;
+
+                if (speechCount >= maxSpeechWindows) {
+                    System.out.println("in theory, this should be handled by the following end of buffer handler");
+                    emitVoiceActivity(outBuffer);
+                    // start counting afresh, otherwise every following speech window is emitted on its own
+                    speechCount = 0;
+                }
+            } else {
+                System.out.println("Reached the end of the buffer! Send what we've captured so far");
+                bytesRead = bufferSize - offset;
+                outBuffer.write(audioData, 0, bytesRead);
+                emitVoiceActivity(outBuffer);
+                // a new utterance starts after every emit, so restart the count here as well
+                speechCount = 0;
+            }
+        } else {
+            // silence
+            silenceCount++;
+
+            // Ignore silence runs less than 10 successive frames.
+            if (state == VoiceActivityDetector.VadState.DETECTED_SPEECH && silenceCount >= IGNORE_SILENCE_WINDOWS) {
+                if (silenceCount >= MAX_SILENCE_WINDOWS && speechCount >= MIN_SPEECH_WINDOWS) {
+                    System.out.println("We have silence after a chunk of speech worth processing");
+                    emitVoiceActivity(outBuffer);
+                } else {
+                    state = VoiceActivityDetector.VadState.DETECTED_SILENCE_AFTER_SPEECH;
+                }
+
+                speechCount = 0;
+            }
+        }
+    }
+
+    /**
+     * Passes the buffered audio to the listener (if one is set), then empties the buffer and returns to
+     * the LISTENING state.
+     */
+    protected void emitVoiceActivity(ByteArrayOutputStream outBuffer) {
+        VoiceActivityListener target = listener;
+        if (target != null) {
+            target.onVoiceActivity(createVoiceActivityStream(outBuffer));
+        }
+        outBuffer.reset();
+        offset = 0;
+        state = VadState.LISTENING;
+    }
+
+    /** Wraps a copy of the buffered bytes in an {@link AudioInputStream} using the capture stream's format. */
+    protected AudioInputStream createVoiceActivityStream(ByteArrayOutputStream outBuffer) {
+        System.out.println("speech: " + mic.getAudioFormat().getFrameSize() * mic.getNumOfFrames(outBuffer.size()));
+        return new AudioInputStream(new ByteArrayInputStream(outBuffer.toByteArray()), audio.getFormat(), mic.getNumOfFrames(outBuffer.size()));
+    }
+}
diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/MoattarHomayounpourVAD.java b/src/main/java/com/darkprograms/speech/recognizer/vad/MoattarHomayounpourVAD.java
new file mode 100644
index 0000000..1f6a3bd
--- /dev/null
+++ b/src/main/java/com/darkprograms/speech/recognizer/vad/MoattarHomayounpourVAD.java
@@ -0,0 +1,75 @@
+package com.darkprograms.speech.recognizer.vad;
+
+/**
+ * Implementation of [https://www.researchgate.net/publication/255667085_A_simple_but_efficient_real-time_voice_activity_detection_algorithm]
+ *
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ * !! WARNING - this is not working correctly !!
+ * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+ *
+ * TODO: need to calculate Spectral Flatness Measure
+ */
+public class MoattarHomayounpourVAD extends AbstractVAD {
+    private static final int ENERGY_PRIMARY_THRESHOLD = 40;
+    private static final int FREQUENCY_PRIMARY_THRESHOLD = 185;
+    private static final int SPECTRAL_FLATNESS_PRIMARY_THRESHOLD = 5;
+
+    private int minEnergy = Integer.MAX_VALUE;
+    private int minFrequency = Integer.MAX_VALUE;
+    private int minSpectralFlatness = Integer.MAX_VALUE;
+
+    /** Resets the running minima, then runs the detection loop of {@link AbstractVAD#run()}. */
+    @Override
+    public void run() {
+        minEnergy = Integer.MAX_VALUE;
+        minFrequency = Integer.MAX_VALUE;
+        minSpectralFlatness = Integer.MAX_VALUE;
+        super.run();
+    }
+
+    /**
+     * Classifies a window as speech when its frequency is below 400 and both its energy and its
+     * frequency exceed the running minima by their thresholds (the third feature, spectral flatness,
+     * is not implemented yet).
+     *
+     * @param audioData one window of raw audio
+     * @return true if the window is classified as speech
+     */
+    @Override
+    protected boolean sampleForSpeech(byte[] audioData) {
+        int counter = 0;
+        int energy = mic.calculateRMSLevel(audioData);
+        int frequency = mic.getFrequency(audioData);
+
+        // ignore frequencies above 400hz (and below 50Hz?)
+        if (frequency < 400) {
+            // 3-2-2- Compute the abstract value of Spectral Flatness Measure SFM(i)
+// TODO https://github.com/filipeuva/SoundBites/blob/master/src/uk/co/biogen/SoundBites/analysis/AnalysisInterface.java#L264
+
+            // 3-3- Supposing that some of the first 30 frames are silence, find the minimum value for E, F & SF
+            minEnergy = Math.min(minEnergy, energy);
+            minFrequency = Math.min(minFrequency, frequency);
+//            minSpectralFlatness = Math.min(minSpectralFlatness, energy);
+
+            // NOTE(review): Math.log(0) is -Infinity, so the energy test always passes while minEnergy is 0
+            double energyThreshold = ENERGY_PRIMARY_THRESHOLD * Math.log(minEnergy);
+            System.out.println("energy: " + energy + "\tfrequency:" + frequency);
+            if (energy - minEnergy >= energyThreshold) counter++;
+            if (frequency - minFrequency >= FREQUENCY_PRIMARY_THRESHOLD) counter++;
+//            if (sfm - minSpectralFlatness) >= SPECTRAL_FLATNESS_PRIMARY_THRESHOLD) counter++;
+        }
+
+        if (counter > 1) {
+            return true;
+        }
+
+        if (minEnergy == Integer.MAX_VALUE) {
+            // no window below 400Hz has been seen yet, so there is no minimum to average with
+            minEnergy = energy;
+        } else {
+            // running average over the silent windows, computed in long so the product cannot overflow
+            minEnergy = (int) (((long) silenceCount * minEnergy + energy) / (silenceCount + 1));
+        }
+        return false;
+    }
+}
diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java
b/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java
new file mode 100644
index 0000000..5eccaf5
--- /dev/null
+++ b/src/main/java/com/darkprograms/speech/recognizer/vad/RecordingListener.java
@@ -0,0 +1,78 @@
+package com.darkprograms.speech.recognizer.vad;
+
+import javax.sound.sampled.AudioFileFormat;
+import javax.sound.sampled.AudioFormat;
+import javax.sound.sampled.AudioInputStream;
+import javax.sound.sampled.AudioSystem;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+
+/**
+ * Useful for debugging & testing microphone: saves every utterance as a WAVE file in the temp
+ * directory and then passes the same audio on to an optional next listener.
+ */
+public class RecordingListener implements VoiceActivityListener {
+    /** file-name timestamp; Date.toString() contains spaces and colons, and Windows rejects colons */
+    private static final String FILE_NAME_PATTERN = "yyyyMMdd-HHmmss-SSS";
+
+    private VoiceActivityListener nextListener;
+
+    /**
+     * Writes the utterance to a timestamped .wav file in the directory named by the java.io.tmpdir
+     * system property and forwards it to the next listener, if any.
+     * The audio is copied into memory first because writing the file consumes the stream; the next
+     * listener gets a fresh stream over the same bytes.
+     *
+     * @param audioInputStream the captured utterance
+     */
+    @Override
+    public void onVoiceActivity(AudioInputStream audioInputStream) {
+        String fileName = new SimpleDateFormat(FILE_NAME_PATTERN).format(new Date()) + ".wav";
+        File out = new File(System.getProperty("java.io.tmpdir"), fileName);
+        // falls back to the caller's stream if the copy below fails
+        AudioInputStream forNext = audioInputStream;
+
+        try {
+            byte[] pcm = readFully(audioInputStream);
+            AudioFormat format = audioInputStream.getFormat();
+            int frameSize = format.getFrameSize();
+            long frames = frameSize > 0 ? pcm.length / frameSize : AudioSystem.NOT_SPECIFIED;
+            forNext = new AudioInputStream(new ByteArrayInputStream(pcm), format, frames);
+
+            System.out.println("Saving recoring to " + out.getAbsolutePath());
+            AudioSystem.write(new AudioInputStream(new ByteArrayInputStream(pcm), format, frames),
+                    AudioFileFormat.Type.WAVE, out);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+
+        if (nextListener != null) {
+            nextListener.onVoiceActivity(forNext);
+        }
+    }
+
+    /** Reads the remaining bytes of the stream into memory. */
+    private static byte[] readFully(AudioInputStream in) throws IOException {
+        ByteArrayOutputStream copy = new ByteArrayOutputStream();
+        byte[] chunk = new byte[4096];
+        int n;
+        while ((n = in.read(chunk)) > 0) {
+            copy.write(chunk, 0, n);
+        }
+        return copy.toByteArray();
+    }
+
+    /** Fluent variant of {@link #setNextListener(VoiceActivityListener)}. */
+    public RecordingListener withNextListener(VoiceActivityListener nextListener) {
+        this.nextListener = nextListener;
+        return this;
+    }
+
+    public void setNextListener(VoiceActivityListener nextListener) {
+        this.nextListener = nextListener;
+    }
+}
diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java b/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java
new file mode 100644
index 0000000..299f53a
--- /dev/null
+++ b/src/main/java/com/darkprograms/speech/recognizer/vad/SimpleVAD.java
@@ -0,0 +1,45 @@
+package com.darkprograms.speech.recognizer.vad;
+
+/**
+ * Adapted from https://stackoverflow.com/questions/18815235/can-i-use-google-speech-recognition-api-in-my-desktop-application
+ *
+ * Volume-based detector: speech starts when the RMS level of a window rises more than the threshold
+ * above the ambient volume sampled when the detector starts running, and ends when the level falls
+ * more than the threshold below the level of the last such loud window.
+ */
+public class SimpleVAD extends AbstractVAD {
+    private int threshold = 10;
+    private int ambientVolume;
+    private int speakingVolume;
+    private boolean speaking;
+
+    /** @param threshold margin compared against the RMS level for both the start and the end of speech */
+    public void setThreshold(int threshold) {
+        this.threshold = threshold;
+    }
+
+    /** Resets the detector state, samples the ambient volume from the microphone and runs the loop. */
+    @Override
+    public void run() {
+        speakingVolume = -2;
+        speaking = false;
+        ambientVolume = mic.getAudioVolume();
+        super.run();
+    }
+
+    @Override
+    protected boolean sampleForSpeech(byte[] audioData) {
+        int volume = mic.calculateRMSLevel(audioData);
+//System.out.println(volume);
+        if (volume > ambientVolume + threshold) {
+            // loud window: (re)start speech and remember its level
+            speakingVolume = volume;
+            speaking = true;
+        }
+        if (speaking && volume + threshold < speakingVolume) {
+            // dropped well below the last loud window: speech has ended
+            speaking = false;
+        }
+        return speaking;
+    }
+}
diff --git a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java
new file mode 100644
index 0000000..6496307
--- /dev/null
+++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityDetector.java
@@ -0,0 +1,47 @@
+package com.darkprograms.speech.recognizer.vad;
+
+import com.darkprograms.speech.microphone.MicrophoneAnalyzer;
+import com.darkprograms.speech.util.FFT;
+
+import javax.sound.sampled.AudioInputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+/**
+ * Detects utterances in the audio captured from a microphone and reports them to a
+ * {@link VoiceActivityListener}.
+ *
+ * @see [https://github.com/Sciss/SpeechRecognitionHMM/blob/master/src/main/java/org/ioe/tprsa/audio/preProcessings/EndPointDetection.java]
+ */
+public interface VoiceActivityDetector {
+    /** States a detector moves through while it is running. */
+    enum VadState {
+        LISTENING,
+        DETECTED_SPEECH,
+        DETECTED_SILENCE_AFTER_SPEECH,
+        CLOSED
+    }
+
+    /** Starts the detector. */
+    void start();
+
+    /** Stops the detector. */
+    void terminate();
+
+    // TODO: optionally provide PipedInputStream to support streaming recognition on Google
+    /** Configures detection with the implementation's default maximum utterance length. */
+    void detectVoiceActivity(MicrophoneAnalyzer mic, VoiceActivityListener listener);
+
+    /**
+     * Configures detection with an explicit maximum utterance length.
+     *
+     * @param mic         microphone to capture audio from
+     * @param maxSpeechMs maximum length of one utterance in ms
+     * @param listener    receives the detected utterances
+     */
+    void detectVoiceActivity(MicrophoneAnalyzer mic, int maxSpeechMs, VoiceActivityListener listener);
+
+    /** Replaces the listener that receives detected utterances. */
+    void setVoiceActivityListener(VoiceActivityListener listener);
+}
diff --git
a/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java
new file mode 100644
index 0000000..ebd1fe9
--- /dev/null
+++ b/src/main/java/com/darkprograms/speech/recognizer/vad/VoiceActivityListener.java
@@ -0,0 +1,14 @@
+package com.darkprograms.speech.recognizer.vad;
+
+import javax.sound.sampled.AudioInputStream;
+
+/** Callback that receives audio passed to {@code onVoiceActivity}. */
+@FunctionalInterface
+public interface VoiceActivityListener {
+    /**
+     * Called with an audio stream to process.
+     *
+     * @param audioInputStream the audio handed to this listener
+     */
+    void onVoiceActivity(AudioInputStream audioInputStream);
+}