## Nate Nichols, August 14, 2006
## ndnichols@cs.northwestern.edu
## Information Laboratory, Northwestern University

import pyTTS
import time
import pygame
import shutil
import os
import win32api
import getopt
import sys

phonemeMapping = {}

def SetupMapping():
    '''Rebuild the module-level phonemeMapping lookup table.

    Each key is a phoneme ID as reported by SAPI.  Each value is a
    (phoneme name, numeric code) pair; ProcessTimings writes both into the
    timing lines that end up appended to the WAV.  The numeric codes were
    found by brute force, by looking through WAV files containing the needed
    phonemes, and are believed (but not proven) to be correct.'''
    global phonemeMapping
    phonemeMapping = {
        10: ('aa', 593),
        11: ('ae', 230),
        12: ('ah', 652),
        13: ('ao', 596),
        14: ('aw', 593),
        15: ('ax', 601),
        16: ('ay', 593),
        17: ('b', 98),
        18: ('ch', 679),
        19: ('d', 100),
        20: ('dh', 240),
        21: ('eh', 603),
        22: ('er', 602),
        23: ('ey', 101),
        24: ('f', 102),
        25: ('g', 103),
        26: ('hh', 104),
        27: ('ih', 618),
        28: ('iy', 105),
        29: ('jh', 676),
        30: ('k', 107),
        31: ('l', 108),
        32: ('m', 109),
        33: ('n', 110),
        34: ('nx', 331),
        35: ('ow', 111),
        36: ('oy', 117),
        37: ('p', 112),
        38: ('r', 633),
        39: ('s', 115),
        40: ('sh', 643),
        41: ('t', 116),
        42: ('th', 952),
        43: ('uh', 650),
        44: ('uw', 117),
        45: ('v', 118),
        46: ('w', 119),
        47: ('y', 106),
        48: ('z', 122),
        49: ('zh', 658),
    }

def IntToHexString(val):
    '''Return the low 32 bits of val as a four-character string, least
    significant byte first (little-endian), which is the layout the size
    fields in WAV headers use.'''
    # Peel off one byte per shift, starting with the lowest-order byte.
    return ''.join([chr((val >> shift) & 0xFF) for shift in (0, 8, 16, 24)])

def RecordWAV(filename, text):
    '''Produce one finished WAV for text at filename.

    Runs the four stages in order:
      1. WriteBasicWAV       - speak the text into a plain WAV file.
      2. GetTimings          - collect the raw word/phoneme timings.
      3. ProcessTimings      - turn those timings into the line syntax that
                               Source expects.
      4. AppendTimingsToWAV  - append the lines to the WAV and update the
                               size fields in its headers.'''
    WriteBasicWAV(filename, text)
    rawTimings = GetTimings(text)
    # ProcessTimings also returns a total time, which this function does not
    # need.
    timingLines, _unusedTotal = ProcessTimings(rawTimings)
    AppendTimingsToWAV(filename, text, timingLines)

def WriteBasicWAV(filename, text):
    '''Speak text to the WAV file named filename.

    This is just the basic write-to-WAV call exposed by pyTTS, made on the
    module-level tts object.  No timing information is written here.'''
    tts.SpeakToWave(filename, text)

def GetTimings(text):
    '''Speak text through fastTalker and return the raw word/phoneme timings.

    For whatever reason, SAPI doesn't seem to raise events when speaking
    to a WAV.  So, we have to speak the text again, this time to memory (where
    it apparently evaporates away again.)  We basically just collect the raw
    information returned by SAPI and return it in a list.

    The return value is a list with one entry per word.  Each entry is itself
    a list: element 0 is the word's text (sliced out of the text argument),
    and every following element is a [CurrentID, NextID, Duration] list for one
    phoneme event belonging to that word.

    There is an odd and irritating issue in that some voices report the first
    phoneme of a word, THEN the word, then the rest of the phonemes.  I define
    these to be "Weird Voices", with Normal Voices being the voices that first
    signal a new word, then the phonemes within that word.  I believe the
    voices that ship with XP (Microsoft Mary, Microsoft Sam, and Microsoft Mike)
    are all Weird Voices.  The NeoSpeech voices, on the other hand, are not.
    This function was written before I realized the existence of Weird Voices.
    So, we use a weird little heuristic (starts with "MS", so is a Microsoft
    voice) to try and separate the Weird Voices from the Normal Ones.  If they
    are weird, we run through the list again at the end and scoot all the
    phonemes down by one.  If it looks like your phonemes are all off by a word
    (you can tell by opening the produced WAV in NotePad and scrolling to the
    very bottom), try setting bWeirdVoice and see if that corrects it.'''
    bWeirdVoice = False
    if fastTalker.Voice.startswith('MS'):
        bWeirdVoice = True
        print 'WEIRD!'
    wordTimings = []
    # Element 1 of what Speak returns is used as the list of events.
    events = fastTalker.Speak(text, pyTTS.tts_is_xml)[1]
    for event in events:
        if event.EventType == pyTTS.tts_event_word:
            # A word event starts a new entry holding the word's text.
            wordTimings.append([text[event.CharacterPosition:event.CharacterPosition + event.Length]])
        elif event.EventType == pyTTS.tts_event_phoneme:
            if bWeirdVoice and not wordTimings:
                # A Weird Voice sends its first phoneme before any word, so
                # give it a placeholder entry to live in; the placeholder is
                # dropped by the [1:] slice below.
                wordTimings.append(['WILLGOAWAY'])
            # NOTE(review): for a voice not flagged as weird, a phoneme event
            # arriving before any word event would make wordTimings[-1] raise
            # IndexError -- confirm that cannot happen.
            wordTimings[-1].append([event.CurrentID, event.NextID, event.Duration])
    if bWeirdVoice:
        # Scoot phonemes down by one word: the last item of each entry really
        # belongs at the front of the next entry (index 1, right after the
        # word text).
        # NOTE(review): if an entry holds only its word text, [-1] is that
        # text rather than a phoneme -- confirm entries always have a phoneme.
        i = 0
        while i < len(wordTimings) - 1:
            wordTimings[i+1].insert(1, wordTimings[i][-1])
            del wordTimings[i][-1]
            i += 1
        # Drop the first entry, which is expected to be the placeholder.
        return wordTimings[1:]
    return wordTimings

def ProcessTimings(wordTimings):
    '''Convert the timings returned by GetTimings into the text lines that
    Source expects.

    wordTimings is a list with one entry per word: the word's text followed
    by one [CurrentID, NextID, Duration] list per phoneme.  The durations are
    relative, so a running clock is kept to turn them into absolute start and
    end times; every time is divided by 1000.0 before being written.

    Returns a (lines, total) tuple.  lines holds one list of strings per
    word, laid out as:
        WORD <text> <start> <end>
        {
        <code> <name> <start> <end> 1      (one per known phoneme)
        }
    total is the clock value after the last phoneme, divided by 1000.0.

    Phonemes whose ID is not in phonemeMapping produce no line but still
    advance the clock.  I don't know what the 1 at the end of each phoneme
    line is for, but I'm guessing it's something to do with emphasis.  You can
    open one of the processed WAVs in a text editor and scroll to the very
    end to see the format.'''
    allWords = []
    clock = 0
    for entry in wordTimings:
        label = entry[0]
        phonemes = entry[1:]  # everything after the word text

        # The word spans from the current clock to the end of its phonemes.
        wordStart = clock
        wordEnd = wordStart
        for phoneme in phonemes:
            wordEnd += phoneme[2]

        lines = ['WORD %s %0.3f %0.3f' % (label, wordStart / 1000.0, wordEnd / 1000.0)]
        lines.append('{')
        for phoneme in phonemes:
            phonemeId = phoneme[0]
            phonemeEnd = clock + phoneme[2]
            if phonemeId in phonemeMapping:
                known = phonemeMapping[phonemeId]
                lines.append('%s %s %0.3f %0.3f 1' % (known[1], known[0], clock / 1000.0, phonemeEnd / 1000.0))
            clock = phonemeEnd
        lines.append('}')
        allWords.append(lines)
    return (allWords, clock / 1000.0)


def AppendTimingsToWAV(filename, text, processedTimings):
    '''Append the phoneme information to the WAV file and update its sizes.

    filename is the WAV to modify, text is the sentence that was spoken, and
    processedTimings is the list of per-word line lists built by
    ProcessTimings.  Nothing is returned; the file is changed on disk.

    Two things happen:

    1. A new chunk is appended to the end of the file: the four characters
       VDAT, a four-byte size (written as zeros at first), then the payload -
       VERSION, PLAINTEXT, WORDS, EMPHASIS and OPTIONS sections, each line
       terminated by a single chr(10).

    2. Two four-byte little-endian size fields are overwritten in place
       (Google for "WAV header" for more information):
         - bytes 4:8 of the file, the size in the main WAV header;
         - the four bytes right after VDAT, the size of the appended payload.

    The file is reopened in 'r+b' mode for step 2, which allows seeking to an
    arbitrary offset and overwriting just those bytes.  That avoids reading
    the whole file into blocks and rewriting it, and with it the special case
    where a size field straddles two blocks.'''
    endl = chr(10)

    # Build the whole payload up front so it can be written in one call.
    pieces = ['VERSION 1.0', 'PLAINTEXT', '{', text, '}', 'WORDS', '{']
    for word in processedTimings: #processedTimings is setup correctly already
        pieces.extend(word)
    #Just do the most basic thing here with options.
    pieces.extend(['}', 'EMPHASIS', '{', '}', 'OPTIONS', '{', 'voice_duck 1', '}'])
    payload = endl.join(pieces) + endl

    #Have to open in binary or endlines get munged.
    out = open(filename, 'ab')
    try:
        #Record size of file before we started adding phoneme stuff to the end.
        out.seek(0, 2)
        originalSize = out.tell()
        #Source boilerplate: chunk name plus a placeholder for the chunk size.
        out.write('VDAT' + chr(0) * 4)
        out.write(payload)
    finally:
        out.close()

    #WAV now has all the phoneme information appended, but we still need to
    #update the size stored in the two(!) headers.
    out = open(filename, 'r+b')
    try:
        out.seek(0, 2)
        newSize = out.tell()

        #Don't quote me on this, but WAVs apparently want their sizes in
        #multiples of 4.  I 'figured' this out by just looking at WAV headers,
        #so please correct me if I'm wrong.
        #NOTE(review): this yields newSize - 8 only when newSize is a multiple
        #of 4 and is up to 3 bytes larger otherwise.  Kept as is because it is
        #the value the output has always carried; confirm against the RIFF
        #specification before changing it.
        totalFilesize = newSize - 5
        totalFilesize -= totalFilesize % 4

        #Amount of stuff we added after the eight-byte VDAT header.
        additionSize = newSize - (originalSize + 8)

        #Main header: the size lives in bytes 4:8.
        out.seek(4)
        out.write(IntToHexString(totalFilesize))

        #Chunk header: the size is the four bytes right after VDAT.
        out.seek(originalSize + 4)
        out.write(IntToHexString(additionSize))
    finally:
        out.close()

def FindAll(text, sub):
    '''Return a list, in increasing order, of every index in text at which
    sub starts.  Overlapping occurrences are all reported.  The list is empty
    when sub does not occur.'''
    positions = []
    cursor = text.find(sub)
    while cursor != -1:
        positions.append(cursor)
        # Resume one character past the last hit so overlaps are found too.
        cursor = text.find(sub, cursor + 1)
    return positions

def SplitSentence(sentence, maxLength=200):
    '''Split sentence into a list of phrases that are each shorter than
    maxLength characters.

    Source doesn't seem to like WAV files that are too long (engine hitches,
    etc.), so long sentences are cut down before being spoken to a WAV.

    A sentence already shorter than maxLength is returned unchanged as a
    one-element list.  Otherwise it is split once and both halves are split
    again recursively:

    * If the sentence contains commas (ignoring commas between two digits,
      as in 15,000), it is split at the comma whose neighbouring commas - or
      the sentence ends - are furthest apart.  The idea is that the widest
      gap is the best bet at separating real phrases, and not things like
      'Blah blah, president on NarNar, said on...'.  Ties go to the earliest
      comma.
    * With no usable comma, it is split at the first space, '!', ':' or '?'
      at or after the midpoint, or failing that the nearest one before the
      midpoint.
    * With no break character at all, it is cut at the midpoint.

    The comma or break character that was split on is dropped; a midpoint
    cut drops nothing.  Returned phrases may be empty strings.

    Raises ValueError if maxLength is less than 2, because a one-character
    sentence could never be made shorter.'''
    if maxLength < 2:
        raise ValueError('maxLength must be at least 2')
    if len(sentence) < maxLength:
        return [sentence]

    #Collect commas, leaving out those that split numbers like 15,000.  A
    #comma at either end of the sentence cannot be inside a number.
    lastIndex = len(sentence) - 1
    commas = []
    for i, char in enumerate(sentence):
        if char != ',':
            continue
        insideNumber = (0 < i < lastIndex and sentence[i-1].isdigit()
                        and sentence[i+1].isdigit())
        if not insideNumber:
            commas.append(i)

    splitPoint = -1
    if commas: #we will break nicely at a comma
        #Bracket the commas with the start and end of the sentence so the
        #first and last comma have a neighbour on both sides.
        bounds = [0] + commas + [len(sentence)]
        #gaps[k] is the distance in chars between the neighbours on either
        #side of commas[k].
        gaps = [bounds[k+1] - bounds[k-1] for k in range(1, len(bounds) - 1)]
        splitPoint = commas[gaps.index(max(gaps))]
    else: #no commas, we'll have to settle for breaking between words
        breakChars = ' !:?'
        middle = len(sentence) // 2
        #scan forward until word break at least.
        for i in range(middle, len(sentence)):
            if sentence[i] in breakChars:
                splitPoint = i
                break
        if splitPoint == -1:
            #Nothing after the midpoint; look backwards instead.
            for i in range(middle - 1, -1, -1):
                if sentence[i] in breakChars:
                    splitPoint = i
                    break
        if splitPoint == -1:
            #No break character anywhere: cut at the midpoint and keep every
            #character.  Both halves are strictly shorter because the
            #sentence has at least two characters here.
            return (SplitSentence(sentence[:middle], maxLength) +
                    SplitSentence(sentence[middle:], maxLength))

    #cute little qsort looking expression; the separator itself is dropped.
    return (SplitSentence(sentence[:splitPoint], maxLength) +
            SplitSentence(sentence[splitPoint + 1:], maxLength))

def ValidSentence(text):
    '''Return True if text is a valid sentence or vaguely sentence-like
    construction, which here means it contains at least one alphabetic
    character.  An empty string is not valid.'''
    hasLetter = False
    for character in text:
        if character.isalpha():
            hasLetter = True
            break
    return hasLetter

def SplitTextIntoSentences(text):
    '''Split a block of text into shorter blocks that are easier for TTS and
    HL2, and return them as a list of stripped strings.

    The steps are:
      1. Split on periods, putting a period back on the end of each piece.
      2. Glue any piece shorter than MIN_SENTENCE_LENGTH onto its neighbour
         (the following piece, or the previous one for the last piece).
         This catches a few instances of periods in abbreviations, etc.
      3. Split each remaining piece on semi-colons.
      4. Pass each result through SplitSentence, which tries to be clever
         about recursively splitting on commas, etc.
      5. Drop anything ValidSentence rejects.

    Text with nothing between its periods (including the empty string)
    returns an empty list, and a text consisting of one short sentence is
    kept as that single sentence.'''
    MIN_SENTENCE_LENGTH = 25
    sentences = [piece + '.' for piece in text.split('.') if piece != '']

    #Merge short pieces forward into the piece that follows them.
    for i in range(len(sentences) - 1):
        if len(sentences[i]) < MIN_SENTENCE_LENGTH:
            sentences[i+1] = sentences[i] + sentences[i+1]
            sentences[i] = ''
    #The last piece has nothing after it, so merge it backwards instead.
    #This only makes sense when there is a previous piece to merge into.
    if len(sentences) > 1 and len(sentences[-1]) < MIN_SENTENCE_LENGTH:
        sentences[-2] += sentences[-1]
        sentences[-1] = ''

    phrases = []
    for sentence in sentences:
        if sentence == '': #emptied by the merging above
            continue
        for clause in sentence.split(';'):
            phrases += SplitSentence(clause)
    return [phrase.strip() for phrase in phrases if ValidSentence(phrase)]

def StraightForwardRecord(text, filename, voice = '', split=False):
    '''This is the main driver function: record text to one or more WAVs.

    text is what to speak.  filename should be just the beginning of a
    filename, on the order of 'c:\\test'; an index and '.wav' are filled in
    automatically, so the files are named filename0.wav, filename1.wav, ...

    voice is the name of the voice to use; the empty string leaves the
    current voice alone.  When split is true the text is first broken up by
    SplitTextIntoSentences and each piece goes to its own file; otherwise
    the whole text goes to filename0.wav.

    Raises IOError if a target file already exists; files written before
    that point are left in place.  The voice that was active on entry is set
    again on both tts and fastTalker before returning, whether or not
    recording succeeded.'''
    oldVoice = tts.GetVoice()
    try:
        if voice != '':
            tts.SetVoiceByName(voice)
            fastTalker.SetVoiceByName(voice)
        if split:
            sentences = SplitTextIntoSentences(text)
        else:
            sentences = [text]
        for i, sentence in enumerate(sentences):
            safeFilename = filename + str(i) + '.wav'
            if os.path.exists(safeFilename):
                #String exceptions are not allowed by newer Pythons, so raise
                #a real exception carrying the same message.
                raise IOError('%s already exists and I\'m not smart enough to figure out a better name!' % safeFilename)
            RecordWAV(safeFilename, sentence)
            print('Successfully wrote "%s" to %s' % (sentence, safeFilename))
    finally:
        #Put the voice back even if something above failed.
        tts.SetVoiceByName(oldVoice)
        fastTalker.SetVoiceByName(oldVoice)


# Module initialisation.  These statements sit outside the __main__ guard, so
# they run when the file is imported as well as when it is run as a script.

# Fill in the phonemeMapping lookup table.
SetupMapping()
# tts is the object WriteBasicWAV uses to speak text into a WAV file.
tts = pyTTS.Create()
# NOTE(review): presumably 44 kHz, 16-bit, 2 channels -- confirm against the
# pyTTS documentation.
tts.SetOutputFormat(44, 16, 2)
# fastTalker is the second synthesizer; GetTimings speaks through it to
# collect the word and phoneme events.
fastTalker = pyTTS.sapi.SynthOnly()

if __name__ == '__main__':
    # TODO: Make this a commandline script
    # NOTE(review): text, filename, voice and split are assigned here but
    # never used; the call at the bottom passes its own literal arguments and
    # relies on the defaults for voice and split.
    text = 'lamb'
    filename = 'c:\\wordLamb'
    voice = ''
    #voice = 'NeoSpeech Paul' #defaults to system voice
    split = False
    # StraightForwardRecord appends an index and '.wav' to the name it is
    # given, so this targets c:\test0.wav.
    StraightForwardRecord("The quick brown fox jumped over the lazy dog", "c:\\test")