{\rtf1\ansi\ansicpg1252\cocoartf1404\cocoasubrtf470
{\fonttbl\f0\fnil\fcharset0 Menlo-Regular;}
{\colortbl;\red255\green255\blue255;\red0\green0\blue0;\red183\green111\blue179;\red202\green202\blue202;
\red70\green137\blue204;\red212\green214\blue154;\red140\green211\blue254;\red194\green126\blue101;\red167\green197\blue152;
\red205\green173\blue106;}
\paperw11900\paperh16840\margl1440\margr1440\vieww27240\viewh17140\viewkind0
\deftab720
\pard\pardeftab720\sl360\partightenfactor0

\f0\fs24 \cf2 \expnd0\expndtw0\kerning0
\outl0\strokewidth0 \strokec3 from\strokec4  tensorflow.python.platform \strokec3 import\strokec4  gfile\
\strokec3 import\strokec4  os, shutil, pandas, re, unicodedata, sys\
\
\
\pard\pardeftab720\sl360\partightenfactor0
\cf2 \strokec4 def _ensure_dir(directory):\
    """Create directory unless something already exists at that path."""\
    try:\
        os.mkdir(directory)\
    except FileExistsError:\
        # A previous run already created it. Any other OSError (missing\
        # parent, permissions) propagates instead of being silently hidden.\
        pass\
\
\
def _normalize_transcript(sentence):\
    """Return sentence lower-cased, restricted to the allowed characters and folded to ASCII."""\
    # Every character outside the allowed set is replaced by a single space.\
    sentence = re.sub("[^a-zA-Z0-9\'e0\'e2\'e7\'e8\'e9\'ea\'eb\'ed\'ee\'ef\'f1\'f4\'f6\'f9\'fb\'fc\'ff\'9c\'92']", " ", sentence.strip().lower())\
    # After the substitution plain spaces are the only whitespace left, so a\
    # bare split() drops empty tokens and collapses runs of spaces.\
    transcript = " ".join(sentence.split())\
    # NFKD decomposes accented letters; the ASCII round trip then drops the\
    # combining marks and every other non-ASCII character.\
    return unicodedata.normalize("NFKD", transcript).encode("ascii", "ignore").decode("ascii", "ignore")\
\
\
def j(path, gender):\
    """Merge the recordings found under <path>/<gender> and split them into train/dev/test.\
\
    Input layout read by this function:\
        <path>/<gender>/<person>/<name>/wavs/*         audio files\
        <path>/<gender>/<person>/<name>/metadata.csv   '|'-separated lines\
\
    Output, all written under <path>/<gender>:\
        wav/wav_<gender>_<n>.wav       renumbered copy of every file found in a 'wavs' directory\
        data.csv                       one 'wav_<gender>_<n>|<lower-cased field 1>' line per\
                                       non-blank metadata line; rewritten on every call\
        train/, dev/, test/            copies of the wavs assigned to each split\
        train.csv, dev.csv, test.csv   columns wav_filename, wav_filesize, transcript\
\
    The sorted data.csv lines are split 70/20/10 (dev and test sizes are rounded\
    down, train takes the remainder). All three CSV files are always written,\
    even when a split is empty.\
\
    Args:\
        path: directory that contains the gender directory.\
        gender: name of the sub-directory of path to process; also embedded in\
            the generated file names.\
\
    Raises:\
        OSError: if a directory or file cannot be read, created or written.\
        IndexError: if a non-blank metadata line contains no '|' separator.\
    """\
    path_gender = os.path.join(path, gender)\
    path_gender_wav = os.path.join(path_gender, 'wav')\
    path_gender_csv = os.path.join(path_gender, 'data.csv')\
    _ensure_dir(path_gender_wav)\
\
    # Audio files and metadata lines are numbered by two independent counters.\
    # NOTE(review): a clip and its transcript only line up when every 'wavs'\
    # directory holds exactly one file per metadata line, in the same order\
    # as the sorted metadata keys - confirm against the dataset.\
    wav_count = 0\
    line_count = 0\
    # Mode 'w': both counters restart at 0 on every call, so appending to an\
    # existing data.csv would duplicate every entry.\
    with open(path_gender_csv, 'w') as merged:\
        for person in sorted(os.listdir(path_gender)):\
            path_person = os.path.join(path_gender, person)\
            if not os.path.isdir(path_person):\
                continue\
            for name in sorted(os.listdir(path_person)):\
                path_name = os.path.join(path_person, name)\
                if not os.path.isdir(path_name):\
                    continue\
                path_name_wav = os.path.join(path_name, 'wavs')\
                path_name_csv = os.path.join(path_name, 'metadata.csv')\
                for wav in sorted(os.listdir(path_name_wav)):\
                    src = os.path.join(path_name_wav, wav)\
                    dst = os.path.join(path_gender_wav, "wav_%s_%d" % (gender, wav_count) + '.wav')\
                    shutil.copy2(src, dst)\
                    wav_count += 1\
                with open(path_name_csv, 'r') as ncsv:\
                    lines = ncsv.readlines()\
                # Order the metadata by its first field, ignoring the characters the pattern strips.\
                # NOTE(review): "'-_" inside the character class is a range, not three literals - confirm intent.\
                lines.sort(key=lambda n: re.sub("[^a-zA-Z0-9\'e0\'e2\'e7\'e8\'e9\'ea\'eb\'ed\'ee\'ef\'f1\'f4\'f6\'f9\'fb\'fc\'ff\'9c\'92'-_,.]", "", n.split('|')[0]))\
                for line in lines:\
                    if not line.strip():\
                        # A blank line carries no transcript.\
                        continue\
                    # When field 1 is the last field it still holds the line's\
                    # newline; drop it so data.csv never gets an empty line.\
                    text = line.split('|')[1].rstrip('\\n').lower()\
                    merged.write("|".join(["wav_%s_%d" % (gender, line_count), text]) + '\\n')\
                    line_count += 1\
\
    splits = ('train', 'dev', 'test')\
    split_dirs = dict((split, os.path.join(path_gender, split)) for split in splits)\
    for split in splits:\
        _ensure_dir(split_dirs[split])\
\
    with open(path_gender_csv, 'r') as merged:\
        lines = merged.readlines()\
    lines.sort()\
    nb_dev = (len(lines) * 20) // 100\
    nb_test = (len(lines) * 10) // 100\
    nb_train = len(lines) - nb_dev - nb_test\
\
    # One row list per split. Collecting them separately keeps every row in the\
    # CSV of its own split even when the dev or test share rounds down to zero.\
    rows = dict((split, []) for split in splits)\
    for index, line in enumerate(lines):\
        fields = line.split('|')\
        name, sentence = fields[0] + '.wav', fields[1]\
        path_wav = os.path.join(path_gender_wav, name)\
        if index < nb_train:\
            split = 'train'\
        elif index < nb_train + nb_dev:\
            split = 'dev'\
        else:\
            split = 'test'\
        try:\
            shutil.copy2(path_wav, split_dirs[split])\
        except OSError:\
            # Best effort: an entry whose audio cannot be copied is skipped by\
            # the existence check below instead of aborting the whole run.\
            pass\
        wav_file = os.path.join(split_dirs[split], name)\
        transcript = _normalize_transcript(sentence)\
        if gfile.Exists(wav_file):\
            wav_filesize = os.path.getsize(wav_file)\
            # NOTE(review): size / 16000 looks like a rough duration estimate - confirm against the audio format.\
            size_ratio = wav_filesize / 16000\
            if 0.5 < size_ratio < 20 and transcript != "" and wav_filesize / len(transcript) > 1400:\
                rows[split].append((os.path.abspath(wav_file), wav_filesize, transcript))\
\
    columns = ["wav_filename", "wav_filesize", "transcript"]\
    for split in splits:\
        dataframe = pandas.DataFrame(data=rows[split], columns=columns)\
        dataframe.to_csv(os.path.join(path_gender, split + '.csv'), index=False)\
\
\
\pard\pardeftab720\sl360\partightenfactor0
\cf2 \strokec3 if\strokec4  \strokec7 __name__\strokec4 ==\strokec8 "__main__"\strokec4 :\
    # Command-line entry point: <path> <gender>. Both arguments are required;\
    # report a usage line instead of failing with a bare IndexError.\
    \strokec3 if\strokec4  \strokec6 len\strokec4 (sys.argv) < \strokec9 3\strokec4 :\
        sys.exit(\strokec8 "usage: \strokec5 %s\strokec8  <path> <gender>"\strokec4  % sys.argv[\strokec9 0\strokec4 ])\
    j(sys.argv[\strokec9 1\strokec4 ], sys.argv[\strokec9 2\strokec4 ])\
\
}