{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "61185b34-45e0-4a78-a84b-2cedd08ad39a", "metadata": {}, "outputs": [], "source": [ "# # Function to convert Hindi text to numerical representation\n", "# from isNumber import is_number\n", "\n", "# def text_to_int (textnum, numwords={}):\n", "# units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',\n", "# 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',\n", "# 'sixteen', 'seventeen', 'eighteen', 'nineteen']\n", "# tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']\n", "# scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion']\n", "# ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}\n", "# ordinal_endings = [('ieth', 'y'), ('th', '')]\n", "\n", "# if not numwords:\n", "# numwords['and'] = (1, 0)\n", "# for idx, word in enumerate(units): numwords[word] = (1, idx)\n", "# for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)\n", "# for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)\n", "\n", "# textnum = textnum.replace('-', ' ')\n", "\n", "# current = result = 0\n", "# curstring = ''\n", "# onnumber = False\n", "# lastunit = False\n", "# lastscale = False\n", "\n", "# def is_numword(x):\n", "# if is_number(x):\n", "# return True\n", "# if word in numwords:\n", "# return True\n", "# return False\n", "\n", "# def from_numword(x):\n", "# if is_number(x):\n", "# scale = 0\n", "# increment = int(x.replace(',', ''))\n", "# return scale, increment\n", "# return numwords[x]\n", "\n", "# for word in textnum.split():\n", "# if word in ordinal_words:\n", "# scale, increment = (1, ordinal_words[word])\n", "# current = current * scale + increment\n", "# if scale > 100:\n", "# result += current\n", "# current = 0\n", "# onnumber = True\n", "# lastunit = False\n", "# lastscale = False\n", "# else:\n", "# for ending, replacement in ordinal_endings:\n", "# if word.endswith(ending):\n", "# word = \"%s%s\" % (word[:-len(ending)], replacement)\n", "\n", "# if (not is_numword(word)) or (word == 'and' and not lastscale):\n", "# if onnumber:\n", "# # Flush the current number we are building\n", "# curstring += repr(result + current) + \" \"\n", "# curstring += word + \" \"\n", "# result = current = 0\n", "# onnumber = False\n", "# lastunit = False\n", "# lastscale = False\n", "# else:\n", "# scale, increment = from_numword(word)\n", "# onnumber = True\n", "\n", "# if lastunit and (word not in scales): \n", "# # Assume this is part of a string of individual numbers to \n", "# # be flushed, such as a zipcode \"one two three four five\" \n", "# curstring += repr(result + current) \n", "# result = current = 0 \n", "\n", "# if scale > 1: \n", "# current = max(1, current) \n", "\n", "# current = current * scale + increment \n", "# if scale > 100: \n", "# result += current \n", "# current = 0 \n", "\n", "# lastscale = False \n", "# lastunit = False \n", "# if word in scales: \n", "# lastscale = True \n", "# elif word in units: \n", "# lastunit = True\n", "\n", "# if onnumber:\n", "# curstring += repr(result + current)\n", "\n", "# return curstring\n" ] }, { "cell_type": "code", "execution_count": null, "id": "a87b26d7-4a0e-4fdc-b03e-1537600faf65", "metadata": {}, "outputs": [], "source": [ "from isNumber import is_number # Remove or replace this if unnecessary\n", "\n", "def text_to_int(textnum, numwords={}):\n", " # Define units, tens, and scales including \"lac\"\n", " units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',\n", " 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',\n", " 'sixteen', 'seventeen', 'eighteen', 'nineteen']\n", " tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']\n", " scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion'] # \"lac\" added\n", " ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}\n", " ordinal_endings = [('ieth', 'y'), ('th', '')]\n", "\n", " if not numwords:\n", " numwords['and'] = (1, 0) # Handle \"one hundred and twenty\"\n", " \n", " # Add units, tens, and scales to numwords\n", " for idx, word in enumerate(units):\n", " numwords[word] = (1, idx)\n", " for idx, word in enumerate(tens):\n", " numwords[word] = (1, idx * 10)\n", " \n", " for idx, word in enumerate(scales):\n", " numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0) # Handle \"lac\" as 10^5\n", "\n", " # Remove hyphens and normalize input\n", " textnum = textnum.replace('-', ' ')\n", "\n", " current = result = 0\n", " curstring = ''\n", " onnumber = False\n", " lastunit = False\n", " lastscale = False\n", "\n", " def is_numword(x):\n", " return is_number(x) or x in numwords\n", "\n", " def from_numword(x):\n", " if is_number(x):\n", " return 0, int(x.replace(',', ''))\n", " return numwords[x]\n", "\n", " for word in textnum.split():\n", " if word in ordinal_words:\n", " scale, increment = (1, ordinal_words[word])\n", " current = current * scale + increment\n", " if scale > 100:\n", " result += current\n", " current = 0\n", " onnumber = True\n", " lastunit = False\n", " lastscale = False\n", " else:\n", " for ending, replacement in ordinal_endings:\n", " if word.endswith(ending):\n", " word = f\"{word[:-len(ending)]}{replacement}\"\n", "\n", " if not is_numword(word) or (word == 'and' and not lastscale):\n", " if onnumber:\n", " curstring += repr(result + current) + \" \"\n", " curstring += word + \" \"\n", " result = current = 0\n", " onnumber = False\n", " lastunit = False\n", " lastscale = False\n", " else:\n", " scale, increment = from_numword(word)\n", " onnumber = True\n", "\n", " if lastunit and word not in scales:\n", " curstring += repr(result + current) + \" \"\n", " result = current = 0\n", "\n", " if scale > 1:\n", " current = max(1, current)\n", "\n", " current = current * scale + increment\n", "\n", " if scale >= 100:\n", " result += current\n", " current = 0\n", "\n", " lastscale = word in scales\n", " lastunit = word in units\n", "\n", " if onnumber:\n", " curstring += repr(result + current)\n", "\n", " return curstring.strip()" ] }, { "cell_type": "code", "execution_count": null, "id": "83997c73-e1b4-4863-b1df-d6de6153e80d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.7" } }, "nbformat": 4, "nbformat_minor": 5 }