import os
import csv
import re
import pandas as pd
from pathlib import Path
import numpy as np
# functions to convert between different transcript/annotation formats

####### 
# "table" refers to a pd.DataFrame with the following columns:
# [uttID, speaker, transcript, start_sec, end_sec]
######### 

# Separate functions write a table to csv, tsv or an ELAN-compatible file (ELAN interprets ALL commas as delimiters, so we need to use tabs instead)

def HHMMSS_to_sec(time_str):
    """Convert a timestamp string to seconds.

    Supported layouts:
        HH:MM:SS[.fff]  hours, minutes and (optionally fractional) seconds
        HH:MM:SS:u      fourth field is tenths (1 digit), hundredths (2 digits)
                        or thousandths (3 digits) of a second
        MM:SS[.fff]     hours assumed to be 0
        SS or SS.fff    hours and minutes assumed to be 0

    Args:
        time_str: timestamp string; surrounding whitespace is ignored.

    Returns:
        Seconds as a float, or None when time_str is empty, is not a string
        (e.g. None, or the float NaN pandas passes for blank cells), or is in
        an unsupported layout (in which case a message is printed).

    Raises:
        ValueError: if a field of a recognised layout is not numeric.
    """
    # Non-string input used to crash on .count(); treat it like a missing value.
    if not isinstance(time_str, str):
        return None
    # Strip first so that trailing whitespace cannot inflate the digit count of
    # the fractional field in the HH:MM:SS:u layout.
    time_str = time_str.strip()
    if not time_str:
        return None

    n_colons = time_str.count(':')
    if n_colons == 2:
        h, m, s = time_str.split(':')
    elif n_colons == 3:
        # weird timestamps where there is a field following seconds delimited by colon
        h, m, s, u = time_str.split(':')
        # determine whether the last field is in tenths, hundredths or
        # thousandths by counting how many digits it has
        if len(u) == 1:  # tenths
            print('Weird time format detected - HH:MM:SS:tenths - please verify this is how you want the time interpreted')
            fraction = float(u) / 10
        elif len(u) == 2:  # hundredths
            fraction = float(u) / 100
        elif len(u) == 3:  # thousandths
            fraction = float(u) / 1000
        else:
            print(f'input string format not supported: {time_str}')
            return None
        s = int(s) + fraction
    elif n_colons == 1:
        # missing HH from timestamp, assuming MM:SS
        m, s = time_str.split(':')
        h = 0
    elif n_colons == 0 and (time_str.count('.') == 1 or time_str.isdecimal()):
        # missing HH:MM from timestamp, assuming SS or SS.ms
        s = float(time_str)
        h = 0
        m = 0
    else:
        print(f'input string format not supported: {time_str}')
        return None
    return int(h) * 3600 + int(m) * 60 + float(s)

def sec_to_timecode(time_sec):
    """Convert seconds to an H:MM:SS:hh timecode as used in .xlsx transcripts.

    Args:
        time_sec: time in seconds (int or float). Intended for non-negative
            values; negative input is not specially handled.

    Returns:
        String 'H:MM:SS:hh' where hh is hundredths of a second; hours are not
        zero-padded.
    """
    # Round to whole hundredths BEFORE splitting into fields. Rounding only the
    # fractional part could yield 100 hundredths (e.g. 1.999 -> '0:00:01:100'),
    # which would later be misread as a three-digit thousandths field.
    total_hundredths = int(round(time_sec * 100))
    total_seconds, u = divmod(total_hundredths, 100)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f'{h}:{m:02}:{s:02}:{u:02}'

def docx_scraped_tsv_to_table(ooona_file):
    """Read a tsv copied out of an ooona .docx table into a table.

    ooona output is a table in a word docx; for now it is manually copied out
    and saved as tsv, with timestamps that still need converting to seconds.
    Input columns are: SHOT  START  END  SPEAKER  DIALOGUE (first line is a
    header and is skipped).

    Args:
        ooona_file: path of the tab-separated file.

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec].

    Raises:
        ValueError: if a non-blank line does not have exactly 5 fields.
    """
    rows = []
    # newline='' lets the csv module do its own newline handling
    with open(ooona_file, newline='') as in_file:
        reader = csv.reader(in_file, delimiter="\t")
        next(reader, None)  # skip header; the default avoids StopIteration on an empty file
        for line in reader:
            if not line:
                # csv yields [] for a blank line (e.g. a trailing newline)
                continue
            if len(line) != 5:
                raise ValueError(
                    f'{ooona_file}, line {reader.line_num}: expected 5 tab-separated fields, got {len(line)}'
                )
            utt_ix, start_time, end_time, speaker, transcript = line
            start_sec = HHMMSS_to_sec(start_time)
            end_sec = HHMMSS_to_sec(end_time)
            rows.append([utt_ix, speaker, transcript, start_sec, end_sec])
    return pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])

def molly_xlsx_to_table(xl_file):
    """Read a contractor-transcribed .xlsx (first sheet) into a table.

    Contractor transcribers provide an xlsx with the following columns
    (matched case-insensitively, since headers are lowercased on load):
        #:           int
        Timecode:    "HH:MM:SS:ss - HH:MM:SS:ss"
        Duration:    HH:MM:SS:ss
        Speaker:     str
        Dialogue:    str
        Annotations: blank
        Error Type:  blank
    Only '#', 'timecode', 'speaker' and 'dialogue' are actually required.

    Args:
        xl_file: path of the .xlsx file.

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec]
        and a fresh 0..n-1 index. Rows where both speaker and dialogue are
        missing are dropped.
    """
    with pd.ExcelFile(xl_file) as xls:
        table = pd.read_excel(xls, xls.sheet_names[0])
    table.columns = table.columns.str.lower()

    # Drop rows with missing values in both speaker and dialogue. This is done
    # while the column is still called 'dialogue' (it used to run after the
    # rename to 'transcript' and always raised KeyError) and before timestamp
    # parsing, so fully blank rows never reach the parser.
    table = table.replace('', np.nan).dropna(subset=['speaker', 'dialogue'], how='all').copy()

    def _to_sec(cell):
        # Non-string cells (NaN/None) carry no timestamp to parse.
        return HHMMSS_to_sec(cell.strip()) if isinstance(cell, str) else None

    table[['start_time', 'end_time']] = table['timecode'].str.split('-', expand=True)
    table['start_sec'] = table['start_time'].apply(_to_sec)
    table['end_sec'] = table['end_time'].apply(_to_sec)

    # Selecting the wanted columns makes an explicit drop of the others unnecessary.
    table = table[['#', 'speaker', 'dialogue', 'start_sec', 'end_sec']]
    table = table.rename(columns={'#': 'uttID', 'dialogue': 'transcript'})
    # Reset the index last so it has no gaps left by the dropped rows.
    return table.reset_index(drop=True)

def LoFi_xlsx_to_table(xl_file):
    """Load the first sheet of a LoFi .xlsx transcript as a table.

    The sheet must contain these columns (exact, case-sensitive names):
        #:           int
        Timecode:    "HH:MM:SS:ss - HH:MM:SS:ss"
        Duration:    HH:MM:SS:ss
        Speaker:     str
        Dialogue:    str
        Annotations: blank
        Error Type:  blank

    Returns a pd.DataFrame with columns
    [uttID, speaker, transcript, start_sec, end_sec]; no rows are filtered out.
    """
    with pd.ExcelFile(xl_file) as workbook:
        sheets = workbook.sheet_names
        raw = pd.DataFrame(pd.read_excel(workbook, sheets[0]))

    # "start - end" -> one raw timestamp column per boundary
    raw[['start_time', 'end_time']] = raw['Timecode'].str.split('-', expand=True)
    for sec_col, time_col in (('start_sec', 'start_time'), ('end_sec', 'end_time')):
        raw[sec_col] = raw[time_col].str.strip().apply(HHMMSS_to_sec)

    raw = raw.drop(columns=['Annotations', 'Error Type', 'Duration'])
    column_map = {'#': 'uttID', 'Speaker': 'speaker', 'Dialogue': 'transcript'}
    return raw[['#', 'Speaker', 'Dialogue', 'start_sec', 'end_sec']].rename(columns=column_map)

def saga_to_table(saga_txt):
    """Parse one of saga's own .txt transcripts into a table.

    The file is read as consecutive 3-line groups:
        speaker (start time MM:SS)
        utterance
        <blank line>
    A header line without "(" is treated as a bare timestamp and the previous
    speaker is reused (None if there is no previous speaker).

    Args:
        saga_txt: path of the transcript text file.

    Returns:
        pd.DataFrame with columns [uttID, speaker, transcript, start_sec, end_sec].
        end_sec is always None (the format has no end times). uttID is the
        0-based line index of the group's separator line (2, 5, 8, ...).
    """
    # TODO: make more robust by pattern matching instead of modulo
    rows = []
    speaker = None
    transcript = None
    start_sec = None
    n_lines = 0
    # Plain line iteration instead of csv.reader: csv quoting rules mangle
    # lines that begin with a double quote.
    with open(saga_txt) as in_file:
        for i, raw_line in enumerate(in_file):
            n_lines = i + 1
            text = raw_line.rstrip('\r\n')
            position = i % 3
            if position == 0:
                spk_time = text.split('(')
                if len(spk_time) < 2:
                    # speaker not changed: keep the speaker carried over from
                    # the previous group (this used to pick up the uttID)
                    timestamp = spk_time[0].strip('):( ')
                else:
                    speaker = spk_time[0]
                    # strip() handles both "(MM:SS)" and "(MM:SS):"
                    timestamp = spk_time[1].strip('):( ')
                start_sec = HHMMSS_to_sec(timestamp)
            elif position == 1:
                transcript = text
            else:
                rows.append([i, speaker, transcript, start_sec, None])
    # A file that ends right after an utterance (no trailing blank line) would
    # otherwise silently lose its last utterance.
    if n_lines % 3 == 2:
        rows.append([n_lines, speaker, transcript, start_sec, None])
    return pd.DataFrame(rows, columns=['uttID', 'speaker', 'transcript', 'start_sec', 'end_sec'])

def table_to_ELAN_tsv(table:pd.DataFrame, path:str):
    """Write table to a tab-separated file compatible with ELAN import.

    Floats are written with 3 decimal places and the index is omitted.
    """
    tsv_options = {'sep': '\t', 'float_format': '%.3f', 'index': False}
    table.to_csv(path, **tsv_options)

def table_to_standard_csv(table:pd.DataFrame, path:str):
    """Write table to the standard csv format agreed upon by the whole team.

    Floats are written with 3 decimal places and the index is omitted.
    """
    # TODO: convert times in seconds back to HH:MM:SS?
    # TODO: split utterances into sentences?
    csv_options = {'float_format': '%.3f', 'index': False}
    table.to_csv(path, **csv_options)

def table_to_utt_labels_csv(table:pd.DataFrame, path:str):
    """Write table to the utt_labels csv format compatible with rosy's isatasr lib.

    Columns 'transcript' and 'uttID' are written as 'utterance' and 'seg'.
    Rows where both speaker and utterance are missing (empty string or NaN)
    are dropped. Floats are written with 3 decimal places, index omitted.

    Args:
        table: DataFrame with at least 'speaker' and 'transcript' columns.
            It is NOT modified (the rename is done on a copy).
        path: destination csv path.
    """
    # Non-inplace rename: the caller's DataFrame must keep its column names.
    labels = table.rename(columns={'transcript': 'utterance', 'uttID': 'seg'})
    # drop rows with missing values in both speaker and utterance
    labels = labels.replace('', np.nan).dropna(subset=['speaker', 'utterance'], how='all')
    labels.to_csv(path, index=False, float_format='%.3f')
    
def table_to_molly_xlsx(tbl:pd.DataFrame,path:str):
    """Write a table to an .xlsx in the contractor ("molly") transcript layout.

    Output columns: #, Timecode ("start - end"), Duration, Speaker, Dialogue,
    Annotations (blank), Error Type (blank). Times are formatted with
    sec_to_timecode. The sheet is named after the file stem.

    Args:
        tbl: DataFrame with columns uttID, speaker, transcript, start_sec,
            end_sec. It is NOT modified.
        path: destination .xlsx path.
    """
    # rename() returns a new frame, so the helper columns added below do not
    # leak into the caller's DataFrame (the original aliased `tbl`).
    tblx = tbl.rename(columns={'uttID':'#', 'speaker':'Speaker','transcript':'Dialogue'})
    duration_s = tblx['end_sec'] - tblx['start_sec']
    start_timecode = tblx['start_sec'].apply(sec_to_timecode)
    end_timecode = tblx['end_sec'].apply(sec_to_timecode)
    tblx['Duration'] = duration_s.apply(sec_to_timecode)
    tblx['Timecode'] = [' - '.join(pair) for pair in zip(start_timecode, end_timecode)]
    tblx['Annotations'] = ''
    tblx['Error Type'] = ''
    tblx = tblx[['#','Timecode','Duration','Speaker','Dialogue','Annotations','Error Type']]
    # Excel limits worksheet names to 31 characters.
    sheet_name = Path(path).stem[:31]
    tblx.to_excel(path, sheet_name=sheet_name, index=False)

def utt_labels_csv_to_table(label_csv:str):
    """Read an utt_labels csv and give it a 'uttID' column.

    utt_labels_csv is the usual format used for diarized, timed transcripts in
    this repo. There are several versions with different columns (with/without
    segment and/or utterance index). uttID is chosen as follows:
        - 'utt' column present: it becomes uttID and 'seg' (if any) is dropped
        - otherwise 'seg' present: it becomes uttID
        - otherwise: the 0-based row number becomes uttID

    Empty cells are kept as empty strings (keep_default_na=False). All other
    columns are returned exactly as read; in particular an 'utterance' column
    is not renamed to 'transcript'.
    NOTE(review): the table convention in this file uses 'transcript' - confirm
    whether callers expect the rename to happen here.

    Args:
        label_csv: path of the csv file.

    Returns:
        pd.DataFrame with a 'uttID' column plus the remaining csv columns.
    """
    table = pd.read_csv(label_csv, keep_default_na=False)
    # choose which column to use for uttID in table
    if 'utt' in table.columns:
        # errors='ignore': files with 'utt' but no 'seg' column must not raise KeyError
        table = table.rename(columns={"utt": "uttID"}).drop(columns='seg', errors='ignore')
    elif 'seg' in table.columns:
        table = table.rename(columns={"seg": "uttID"})
    else:
        table = table.reset_index().rename(columns={"index": "uttID"})

    return table