## Nate Nichols, August 14, 2006
## ndnichols@cs.northwestern.edu
## Information Laboratory, Northwestern University

import pyTTS
import time
import pygame
import shutil
import os
import win32api
import getopt
import sys

# Maps the phoneme id reported by SAPI to a (phoneme name, phoneme code)
# pair.  Filled in by SetupMapping().
phonemeMapping = {}

# SplitSentence() returns anything shorter than this untouched.
MAX_PHRASE_LENGTH = 200

# Characters SplitSentence() is willing to break a phrase on.  The character
# at the break is dropped from the output.
PHRASE_BREAK_CHARS = ' !:?'


def SetupMapping():
    '''Populate the global phonemeMapping table.

    The numbers returned by SAPI need to be converted before appending them
    to the WAV.  I got this conversion by brute-force looking through WAV
    files with the phonemes I needed.  I think they're all correct.

    Each entry maps a SAPI phoneme id to (phoneme name, phoneme code).'''
    global phonemeMapping
    phonemeMapping = {
        10: ('aa', 593),
        11: ('ae', 230),
        12: ('ah', 652),
        13: ('ao', 596),
        14: ('aw', 593),
        15: ('ax', 601),
        16: ('ay', 593),
        17: ('b', 98),
        18: ('ch', 679),
        19: ('d', 100),
        20: ('dh', 240),
        21: ('eh', 603),
        22: ('er', 602),
        23: ('ey', 101),
        24: ('f', 102),
        25: ('g', 103),
        26: ('hh', 104),
        27: ('ih', 618),
        28: ('iy', 105),
        29: ('jh', 676),
        30: ('k', 107),
        31: ('l', 108),
        32: ('m', 109),
        33: ('n', 110),
        34: ('nx', 331),
        35: ('ow', 111),
        36: ('oy', 117),
        37: ('p', 112),
        38: ('r', 633),
        39: ('s', 115),
        40: ('sh', 643),
        41: ('t', 116),
        42: ('th', 952),
        43: ('uh', 650),
        44: ('uw', 117),
        45: ('v', 118),
        46: ('w', 119),
        47: ('y', 106),
        48: ('z', 122),
        49: ('zh', 658),
    }


def IntToHexString(val):
    '''Return the low 32 bits of val as a four-character little-endian
    string, which is how sizes are stored in WAV headers.'''
    mask = 255
    return ''.join([chr((val >> shift) & mask) for shift in (0, 8, 16, 24)])


def RecordWAV(filename, text):
    '''This is the money function.

    The four calls are: write basic WAV, get word/phoneme timings, process
    word/phoneme timings into the syntax that Source expects, open the WAV
    again and append the phoneme timings and tweak the WAV headers.'''
    WriteBasicWAV(filename, text)
    wordTimings = GetTimings(text)
    # The total length (second element) is not needed here.
    processedTimings, length = ProcessTimings(wordTimings)
    AppendTimingsToWAV(filename, text, processedTimings)


def WriteBasicWAV(filename, text):
    '''Just the basic write-to-WAV exposed by pyTTS.'''
    tts.SpeakToWave(filename, text)


def GetTimings(text):
    '''Speak text to memory and return the word/phoneme events SAPI raised.

    For whatever reason, SAPI doesn't seem to raise events when speaking to
    a WAV.  So, we have to speak the text again, this time to memory (where
    it apparently evaporates away again.)  We basically collect just the raw
    information returned by SAPI and return it in a list.

    The result is a list with one entry per word.  Each entry is a list
    whose first element is the word's text and whose remaining elements are
    [CurrentID, NextID, Duration] triples, one per phoneme.

    There is an odd and irritating issue in that some voices report the
    first phoneme of a word, THEN the word, then the rest of the phonemes.
    I define these to be "Weird Voices", with Normal Voices being the voices
    that first signal a new word, then the phonemes within that word.  I
    believe the voices that ship with XP (Microsoft Mary, Microsoft Sam, and
    Microsoft Mike) are all Weird Voices.  The NeoSpeech voices, on the
    other hand, are not.

    This function was written before I realized the existence of Weird
    Voices.  So, we use a weird little heuristic (starts with "MS", so is a
    Microsoft voice) to try and separate the Weird Voices from the Normal
    Ones.  If they are weird, we run through the list again at the end and
    scoot all the phonemes down by one.  If it looks like your phonemes are
    all off by a word (you can tell by opening the produced WAV in NotePad
    and scrolling to the very bottom), try setting bWeirdVoice and see if
    that corrects it.'''
    bWeirdVoice = False
    if fastTalker.Voice.startswith('MS'):
        bWeirdVoice = True
        print('WEIRD!')

    wordTimings = []
    events = fastTalker.Speak(text, pyTTS.tts_is_xml)[1]
    for event in events:
        if event.EventType == pyTTS.tts_event_word:
            wordStart = event.CharacterPosition
            wordTimings.append([text[wordStart:wordStart + event.Length]])
        elif event.EventType == pyTTS.tts_event_phoneme:
            if bWeirdVoice and not wordTimings:
                # A Weird Voice sends its first phoneme before any word
                # event; park it in a placeholder entry that is dropped
                # before returning.
                wordTimings.append(['WILLGOAWAY'])
            # NOTE(review): a Normal Voice that reports a phoneme before its
            # first word event would raise IndexError here.
            wordTimings[-1].append([event.CurrentID, event.NextID,
                                    event.Duration])

    if bWeirdVoice:
        # The last phoneme recorded under each entry really belongs to the
        # start of the next one, so scoot it down.
        for i in range(len(wordTimings) - 1):
            # Only move real phoneme timings; an entry holding nothing but
            # its word label has no phoneme to hand over.
            if len(wordTimings[i]) > 1:
                wordTimings[i + 1].insert(1, wordTimings[i][-1])
                del wordTimings[i][-1]
        return wordTimings[1:]
    return wordTimings


def ProcessTimings(wordTimings):
    '''Convert the timings returned by GetTimings into the lines Source
    expects.

    You can open one of the processed WAVs in a text editor and scroll to
    the very end to see how the format works.  The gist is that you list
    the words, their start and end times, and the start and end times of
    each phoneme within the word.  This requires converting from the
    relative durations returned by SAPI to absolute timings.  Durations are
    divided by 1000.0 before being written.  Phonemes whose id is not in
    phonemeMapping are not written, but still advance the clock.

    I don't know what the 1 at the end of the line is there for, but I'm
    guessing it's something to do with emphasis.

    Returns a (processedTimings, totalTime) tuple: processedTimings is a
    list with one list of output lines per word, and totalTime is the sum
    of all phoneme durations divided by 1000.0.'''
    processedTimings = []
    totalTime = 0
    for word in wordTimings:
        phonemes = word[1:]  # word[0] is the word's text
        startTime = totalTime
        endTime = startTime
        for timing in phonemes:
            endTime += timing[2]

        lines = ['WORD %s %0.3f %0.3f' % (word[0], startTime / 1000.0,
                                          endTime / 1000.0),
                 '{']
        for timing in phonemes:
            if timing[0] in phonemeMapping:
                name, code = phonemeMapping[timing[0]]
                lines.append('%s %s %0.3f %0.3f 1'
                             % (code, name, startTime / 1000.0,
                                (startTime + timing[2]) / 1000.0))
            startTime += timing[2]
        lines.append('}')

        processedTimings.append(lines)
        totalTime = startTime
    return (processedTimings, totalTime / 1000.0)


def AppendTimingsToWAV(filename, text, processedTimings):
    '''Append the phoneme information to the WAV and fix up its headers.

    We need to append the phoneme information and some boilerplate to the
    end of the WAV file, and then update the size in two different places.
    (Google for "WAV header" for more information.)  The first place is the
    main WAV header, bytes 4:8.  The other is the chunk size, which is the
    four bytes right after VDAT.

    The appended data is written in append mode.  The file is then reopened
    in 'r+b' mode so the two size fields can be overwritten in place without
    rewriting the rest of the file.

    filename is the WAV to modify, text is the plain text that was spoken,
    and processedTimings is the first element of the tuple returned by
    ProcessTimings.'''
    endl = chr(10)

    # Everything up to here is the plain WAV, so the VDAT chunk starts at
    # this offset: 4 bytes of tag, 4 bytes of size, then the data.
    chunkStart = os.path.getsize(filename)
    sizeFieldOffset = chunkStart + 4
    dataStart = chunkStart + 8

    lines = ['VERSION 1.0', 'PLAINTEXT', '{', text, '}', 'WORDS', '{']
    for word in processedTimings:
        # processedTimings is setup correctly already
        lines.extend(word)
    # Just do the most basic thing here with options.
    lines.extend(['}', 'EMPHASIS', '{', '}',
                  'OPTIONS', '{', 'voice_duck 1', '}'])

    # Have to open in binary or endlines get munged.
    out = open(filename, 'ab')
    try:
        # Chunk tag plus a zeroed size field that is patched below.
        out.write('VDAT' + chr(0) * 4)
        out.write(endl.join(lines) + endl)
    finally:
        out.close()

    # WAV now has all the phoneme information appended, but we still need to
    # update the size stored in the WAV headers.
    totalSize = os.path.getsize(filename)

    # Don't quote me on this, but WAVs apparently want their sizes in
    # multiples of 4.  I 'figured' this out by just looking at WAV headers,
    # so please correct me if I'm wrong.
    # NOTE(review): the RIFF format defines this field as file size minus 8;
    # the historical computation is kept unchanged here.
    riffSize = totalSize - 5
    riffSize -= riffSize % 4

    # Amount of stuff we added after the VDAT size field.
    additionSize = totalSize - dataStart

    out = open(filename, 'r+b')
    try:
        out.seek(4)
        out.write(IntToHexString(riffSize))
        out.seek(sizeFieldOffset)
        out.write(IntToHexString(additionSize))
    finally:
        out.close()


def FindAll(text, sub):
    '''A little utility function that returns a list of all the indices
    where sub occurs in text.'''
    ret = []
    nextIndex = text.find(sub)
    while nextIndex != -1:
        ret.append(nextIndex)
        nextIndex = text.find(sub, nextIndex + 1)
    return ret


def SplitSentence(sentence):
    '''Split a too-long sentence into a list of shorter phrases.

    Source doesn't seem to like WAV files that are too long (engine hitches,
    etc.)  So this function takes a sentence, and returns a list of phrases
    that are all short enough (currently, less than 200 chars) to be used by
    the engine with no problem after being spoken to a WAV.  It prefers to
    split near a comma.  If there are no usable commas it splits in between
    words, and if there is no word break at all it cuts the text in half.

    The character at the split point is dropped, except when the text has to
    be cut in half.'''
    if len(sentence) < MAX_PHRASE_LENGTH:
        return [sentence]

    lastIndex = len(sentence) - 1
    # Pull out commas that split numbers like 15,000.  A comma at either end
    # of the sentence cannot be inside a number.
    commas = [i for i in FindAll(sentence, ',')
              if not (0 < i < lastIndex
                      and sentence[i - 1].isdigit()
                      and sentence[i + 1].isdigit())]

    if commas:
        # We will break nicely at a comma.  Pick the comma with the most
        # distance to its neighbouring commas (or the ends of the sentence).
        # The idea is that picking the largest distance will be the best bet
        # at actually splitting phrases, and not things like
        # 'Blah blah, president on NarNar, said on...'
        bounded = [0] + commas + [len(sentence)]
        distances = [bounded[i + 1] - bounded[i - 1]
                     for i in range(1, len(bounded) - 1)]
        splitPoint = commas[distances.index(max(distances))]
    else:
        # No commas, we'll have to settle for breaking between words.
        splitPoint = len(sentence) // 2

    # Scan forward until a word break at least.
    scanStart = splitPoint
    while (splitPoint < len(sentence)
           and sentence[splitPoint] not in PHRASE_BREAK_CHARS):
        splitPoint += 1

    if splitPoint == len(sentence):
        # Nothing to break on after the chosen point.  Splitting here would
        # hand the whole sentence back to ourselves forever, so look
        # backwards instead.
        splitPoint = scanStart
        while (splitPoint > 0
               and sentence[splitPoint] not in PHRASE_BREAK_CHARS):
            splitPoint -= 1
        if splitPoint == 0:
            # No usable break anywhere: cut in half, keeping every char.
            middle = len(sentence) // 2
            return (SplitSentence(sentence[:middle])
                    + SplitSentence(sentence[middle:]))

    # Cute little qsort looking expression.
    return (SplitSentence(sentence[:splitPoint])
            + SplitSentence(sentence[splitPoint + 1:]))


def ValidSentence(text):
    '''Returns true if text is a valid sentence or vaguely sentence-like
    construction, i.e. it contains at least one letter.'''
    for letter in text:
        if letter.isalpha():
            return True
    return False


def SplitTextIntoSentences(text):
    '''Split a block of text into shorter blocks that are easier for TTS
    and HL2, and return a list of the smaller blocks.

    This function does the obvious splitting (on periods and semi-colons)
    before passing the sentences through SplitSentence, which tries to be
    clever about recursively splitting on commas, etc.  Pieces that contain
    no letters are dropped, and the rest are stripped of surrounding
    whitespace.'''
    sentences = [s + '.' for s in text.split('.') if s != '']

    # This next test will catch a few instances of periods in abbreviations,
    # etc: anything too short is glued onto the sentence that follows it.
    MIN_SENTENCE_LENGTH = 25
    for i in range(len(sentences) - 1):
        if len(sentences[i]) < MIN_SENTENCE_LENGTH:
            sentences[i + 1] = sentences[i] + sentences[i + 1]
            sentences[i] = ''
    # A short final sentence has nothing after it, so glue it backwards.
    # Only possible when there is a previous slot to glue it onto.
    if len(sentences) >= 2 and len(sentences[-1]) < MIN_SENTENCE_LENGTH:
        sentences[-2] += sentences[-1]
        sentences[-1] = ''
    sentences = [s for s in sentences if s != '']

    clauses = []
    for sentence in sentences:
        clauses += sentence.split(';')

    phrases = []
    for clause in clauses:
        phrases += SplitSentence(clause)

    return [s.strip() for s in phrases if ValidSentence(s)]


def StraightForwardRecord(text, filename, voice = '', split=False):
    '''This is the main driver function.

    filename should be just the beginning of a filename, on the order of
    'c:\\test'.  An index and the .wav will be filled in automatically.
    If voice is given, both synthesizers are switched to it for the
    duration of the call.  If split is true, text is first broken up with
    SplitTextIntoSentences and each piece goes to its own file.

    Raises IOError if one of the output files already exists.'''
    oldVoice = tts.GetVoice()
    if voice != '':
        tts.SetVoiceByName(voice)
        fastTalker.SetVoiceByName(voice)
    try:
        if split:
            sentences = SplitTextIntoSentences(text)
        else:
            sentences = [text]
        for i, sentence in enumerate(sentences):
            safeFilename = filename + str(i) + '.wav'
            if os.path.exists(safeFilename):
                raise IOError('%s already exists and I\'m not smart enough to figure out a better name!' % safeFilename)
            RecordWAV(safeFilename, sentence)
            print('Successfully wrote "%s" to %s' % (sentence, safeFilename))
    finally:
        # Put the original voice back even if recording failed part-way.
        tts.SetVoiceByName(oldVoice)
        fastTalker.SetVoiceByName(oldVoice)


SetupMapping()
tts = pyTTS.Create()
tts.SetOutputFormat(44, 16, 2)
fastTalker = pyTTS.sapi.SynthOnly()

if __name__ == '__main__':
    # TODO: Make this a commandline script
    text = 'lamb'
    filename = 'c:\\wordLamb'
    voice = ''
    #voice = 'NeoSpeech Paul' #defaults to system voice
    split = False
    StraightForwardRecord("The quick brown fox jumped over the lazy dog", "c:\\test")