one を 1 に、two を 2 に、といったように変換する必要があります。
ライブラリまたはクラスなどでこれを行う方法はありますか?
このコードの大部分は、numwords dictをセットアップすることです。これは、最初の呼び出しでのみ行われます。
def text2int(textnum, numwords={}):
    """Convert a spelled-out English number into an integer.

    Args:
        textnum: Whitespace-separated lowercase number words, e.g.
            "three hundred thirty seven". The word "and" is accepted and
            contributes nothing.
        numwords: Lookup table mapping each word to a ``(scale, increment)``
            pair. The mutable default is deliberate: it is filled on the
            first call and reused as a cache by later calls.

    Returns:
        The integer value of the phrase (0 for an empty string).

    Raises:
        ValueError: If a word is not a known number word.
    """
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
            "sixteen", "seventeen", "eighteen", "nineteen",
        ]
        # The two leading "" entries only pad the list so that index * 10
        # equals the value of the word at that index.
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # never register the "" placeholders as a number word
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            # idx 0 ("hundred") -> 10**2; every later scale -> 10**(3 * idx)
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    current = result = 0
    for word in textnum.split():
        if word not in numwords:
            raise ValueError("Illegal Word: " + word)
        scale, increment = numwords[word]
        current = current * scale + increment
        if scale > 100:
            # A thousand-or-larger scale closes the current group.
            result += current
            current = 0
    return result + current
# Call print as a function so the example runs on both Python 2 and Python 3.
print(text2int("seven billion one hundred million thirty one thousand three hundred thirty seven"))
# 7100031337
興味がある人は、文字列の残りを保持するバージョンをハックしました(バグがあるかもしれませんが、あまりテストしていません)。
def text2int(textnum, numwords={}):
    """Replace spelled-out English numbers inside a sentence with digits.

    Words that are not number words are copied to the output unchanged.
    Hyphens are treated as spaces, and ordinal words ("first", "twentieth",
    "fifty-seventh") are converted to their cardinal value.

    Args:
        textnum: The sentence to process.
        numwords: Lookup table mapping each number word to a
            ``(scale, increment)`` pair. The mutable default is deliberate:
            it is filled on the first call and reused as a cache afterwards.

    Returns:
        The rewritten sentence. Every ordinary word is followed by a single
        space, so the result ends with a space when the last word is not
        part of a number.
    """
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
            "sixteen", "seventeen", "eighteen", "nineteen",
        ]
        # The two leading "" entries only pad the list so that index * 10
        # equals the value of the word at that index.
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # never register the "" placeholders as a number word
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    pieces = []
    onnumber = False
    for word in textnum.split():
        if word in ordinal_words:
            # Irregular ordinals always have scale 1, so they simply add on.
            current += ordinal_words[word]
            onnumber = True
            continue

        # Strip a regular ordinal ending into a separate variable so that an
        # ordinary word such as "with" is still emitted unmodified.
        stem = word
        for ending, replacement in ordinal_endings:
            if stem.endswith(ending):
                stem = stem[:-len(ending)] + replacement
                break

        # "and" only belongs to a number while one is being built
        # ("two hundred and five"); otherwise it is an ordinary word.
        is_number_word = stem in numwords and (stem != "and" or onnumber)

        if not is_number_word:
            if onnumber:
                # Flush the number that has just ended.
                pieces.append(str(result + current) + " ")
            pieces.append(word + " ")
            result = current = 0
            onnumber = False
        else:
            scale, increment = numwords[stem]
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True

    if onnumber:
        pieces.append(str(result + current))
    return "".join(pieces)
例:
>>> text2int("I want fifty five hot dogs for two hundred dollars.")
I want 55 hot dogs for 200 dollars.
たとえば、「$ 200」がある場合は問題が発生する可能性があります。しかし、これは本当に大変でした。
コードスニペットをありがとう...時間を大幅に節約できました!
序数の単語("first"、"second" など)、ハイフンでつながれた単語("one-hundred" など)、ハイフンでつながれた序数の単語("fifty-seventh" など)への対応を追加する必要がありました。追加は数行で済みました:
def text2int(textnum, numwords={}):
    """Convert a spelled-out English cardinal or ordinal number to an integer.

    Hyphens are treated as spaces, so "fifty-seventh" and "one-hundred" are
    accepted.

    Args:
        textnum: Lowercase number words separated by whitespace or hyphens.
        numwords: Lookup table mapping each word to a ``(scale, increment)``
            pair. The mutable default is deliberate: it is filled on the
            first call and reused as a cache afterwards.

    Returns:
        The integer value of the phrase (0 for an empty string).

    Raises:
        ValueError: If a word is not a known number word.
    """
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
            "sixteen", "seventeen", "eighteen", "nineteen",
        ]
        # The two leading "" entries only pad the list so that index * 10
        # equals the value of the word at that index.
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # never register the "" placeholders as a number word
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
        else:
            # Reduce a regular ordinal ("sixth", "twentieth") to its cardinal.
            stem = word
            for ending, replacement in ordinal_endings:
                if stem.endswith(ending):
                    stem = stem[:-len(ending)] + replacement
                    break
            if stem not in numwords:
                # Report the word as the caller wrote it, not the stripped stem.
                raise ValueError("Illegal Word: " + word)
            scale, increment = numwords[stem]

        current = current * scale + increment
        if scale > 100:
            result += current
            current = 0
    return result + current
まさにこの目的のために、word2number という Python モジュールを PyPI にリリースしました。 https://github.com/akshaynagpal/w2n
以下のコマンドでインストールします。
pip install word2number
pip が最新バージョンに更新されていることを確認してください。
使用法:
# The package, its module and its function are all lowercase; Python imports
# are case-sensitive, so the capitalised spelling fails to import.
from word2number import w2n

print(w2n.word_to_num("two million three thousand nine hundred and eighty four"))
2003984
私の入力は音声からテキストへの変換結果であり、数値を常に合計すればよいわけではないため、少し異なるものが必要でした。たとえば、「my zipcode is one two three four five」は「my zipcode is 15」に変換されるべきではありません。
Andrewの answer を取り、エラーとしてハイライトされた他のいくつかのケースを処理するように調整し、上記のzipcodeのような例のサポートも追加しました。いくつかの基本的なテストケースを以下に示しますが、まだ改善の余地があると確信しています。
def is_number(x):
    """Return True if ``x`` can be converted by ``float()``.

    For strings, thousands separators (",") are removed before the check, so
    "1,000" counts as a number.

    Args:
        x: Any value; typically a token taken from the input text.

    Returns:
        True if ``float(x)`` succeeds, False otherwise.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None; ValueError: not numeric
        # text; OverflowError: an int too large to fit in a float.
        return False
    return True
def text2int(textnum, numwords={}):
    """Replace spelled-out English numbers inside a sentence with digits.

    Besides ordinary phrases ("three hundred and forty five" -> "345") this
    version treats a unit word followed by another non-scale number word as
    the start of a new number whose digits are written directly after the
    previous one, so "one two three four five" becomes "12345" and
    "nineteen ninety six" becomes "1996". Integer literals such as "1" or
    "1,000" are accepted as number words ("1 million" -> "1000000").

    Args:
        textnum: The sentence to process. Hyphens are treated as spaces.
        numwords: Lookup table mapping each number word to a
            ``(scale, increment)`` pair. The mutable default is deliberate:
            it is filled on the first call and reused as a cache afterwards.

    Returns:
        The rewritten sentence. Every ordinary word is followed by a single
        space, so the result ends with a space when the last word is not
        part of a number.
    """
    units = [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
        'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
        'sixteen', 'seventeen', 'eighteen', 'nineteen',
    ]
    # The two leading '' entries only pad the list so that index * 10 equals
    # the value of the word at that index.
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    scales = ['hundred', 'thousand', 'million', 'billion', 'trillion']
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    if not numwords:
        numwords['and'] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # never register the '' placeholders as a number word
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ''
    onnumber = False
    lastunit = False   # previous number word was one of `units`
    lastscale = False  # previous number word was one of `scales`

    def parse_digits(x):
        """Return the int value of a literal such as "1,000", else None."""
        if not is_number(x):
            return None
        try:
            return int(x.replace(',', ''))
        except ValueError:
            # Float-like text ("1.5", "nan") passes is_number() but is not an
            # integer; leave it in the sentence as an ordinary word.
            return None

    def is_numword(x):
        """Return True if ``x`` is an integer literal or a known number word."""
        return parse_digits(x) is not None or x in numwords

    def from_numword(x):
        """Return the ``(scale, increment)`` pair for number word ``x``."""
        value = parse_digits(x)
        if value is not None:
            # Scale 0 makes a literal overwrite, not extend, the current group.
            return 0, value
        return numwords[x]

    for word in textnum.split():
        if word in ordinal_words:
            # Irregular ordinals always have scale 1, so they simply add on.
            current += ordinal_words[word]
            onnumber = True
            lastunit = False
            lastscale = False
            continue

        # Strip a regular ordinal ending into a separate variable so that an
        # ordinary word such as "with" is still emitted unmodified.
        stem = word
        for ending, replacement in ordinal_endings:
            if stem.endswith(ending):
                stem = stem[:-len(ending)] + replacement
                break

        # "and" is only part of a number directly after a scale word
        # ("three hundred and forty five"); elsewhere it is ordinary text.
        if (not is_numword(stem)) or (stem == 'and' and not lastscale):
            if onnumber:
                # Flush the current number we are building
                curstring += str(result + current) + " "
            curstring += word + " "
            result = current = 0
            onnumber = False
            lastunit = False
            lastscale = False
        else:
            scale, increment = from_numword(stem)
            onnumber = True

            if lastunit and (stem not in scales):
                # Assume this is part of a string of individual numbers to
                # be flushed, such as a zipcode "one two three four five"
                curstring += str(result + current)
                result = current = 0

            if scale > 1:
                # A bare scale word ("hundred") means one of that scale.
                current = max(1, current)

            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0

            lastscale = False
            lastunit = False
            if stem in scales:
                lastscale = True
            elif stem in units:
                lastunit = True

    if onnumber:
        curstring += str(result + current)

    return curstring
いくつかのテスト...
one two three -> 123
three forty five -> 345
three and forty five -> 3 and 45
three hundred and forty five -> 345
three hundred -> 300
twenty five hundred -> 2500
three thousand and six -> 3006
three thousand six -> 3006
nineteenth -> 19
twentieth -> 20
first -> 1
my Zip is one two three four five -> my Zip is 12345
nineteen ninety six -> 1996
fifty-seventh -> 57
one million -> 1000000
first hundred -> 100
I will buy the first thousand -> I will buy the 1000 # probably should leave ordinal in the string
thousand -> 1000
hundred and six -> 106
1 million -> 1000000
これは、最初の回答のコードのc#実装です。
/// <summary>
/// Converts a spelled-out English number such as
/// "three hundred thirty seven" into its numeric value.
/// </summary>
/// <param name="text">
/// Lowercase number words separated by spaces, hyphens or em dashes.
/// The word "and" is accepted and contributes nothing.
/// </param>
/// <returns>The value of the phrase; 0 when it contains no words.</returns>
/// <exception cref="KeyNotFoundException">
/// A word in <paramref name="text"/> is not a known number word.
/// </exception>
public static double ConvertTextToNumber(string text)
{
    string[] units = new string[] {
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
    };
    string[] tens = new string[] {"", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"};
    string[] scales = new string[] { "hundred", "thousand", "million", "billion", "trillion" };

    Dictionary<string, ScaleIncrementPair> numWord = new Dictionary<string, ScaleIncrementPair>();
    numWord.Add("and", new ScaleIncrementPair(1, 0));
    for (int i = 0; i < units.Length; i++)
    {
        numWord.Add(units[i], new ScaleIncrementPair(1, i));
    }
    // tens[0] and tens[1] are "" placeholders that keep index * 10 equal to
    // the word's value. Registering "" would make every empty token count as 10.
    for (int i = 2; i < tens.Length; i++)
    {
        numWord.Add(tens[i], new ScaleIncrementPair(1, i * 10));
    }
    for (int i = 0; i < scales.Length; i++)
    {
        // "hundred" is 10^2; every later scale is 10^(3 * i).
        double scale = (i == 0) ? 100 : Math.Pow(10, (i * 3));
        numWord.Add(scales[i], new ScaleIncrementPair(scale, 0));
    }

    double current = 0;
    double result = 0;
    // RemoveEmptyEntries drops the empty tokens produced by consecutive separators.
    string[] words = text.Split(new char[] { ' ', '-', '—' }, StringSplitOptions.RemoveEmptyEntries);
    foreach (var word in words)
    {
        ScaleIncrementPair scaleIncrement;
        if (!numWord.TryGetValue(word, out scaleIncrement))
        {
            // Same exception type as the dictionary indexer, but naming the word.
            throw new KeyNotFoundException("Illegal word: " + word);
        }
        current = current * scaleIncrement.scale + scaleIncrement.increment;
        if (scaleIncrement.scale > 100)
        {
            // A thousand-or-larger scale closes the current group.
            result += current;
            current = 0;
        }
    }
    return result + current;
}
/// <summary>
/// Holds the multiplier and the additive value associated with one number word.
/// </summary>
public struct ScaleIncrementPair
{
    /// <summary>Factor the running group is multiplied by.</summary>
    public double scale;

    /// <summary>Amount added to the running group after scaling.</summary>
    public int increment;

    /// <summary>Creates a pair from a scale <paramref name="s"/> and an increment <paramref name="i"/>.</summary>
    public ScaleIncrementPair(double s, int i)
    {
        this.scale = s;
        this.increment = i;
    }
}
簡単なケースアプローチを次に示します。
>>> number = {'one':1,
... 'two':2,
... 'three':3,}
>>>
>>> number['two']
2
それとも、"12万、127"を処理できるものを探していますか?
解析したい数が限られている場合、これは辞書に簡単にハードコードできます。
少し複雑な場合は、比較的単純な数字の文法に基づいて、この辞書を自動的に生成することをお勧めします。これに沿った何か(もちろん、一般化された...)
# Build the 30-39 entries from the single-digit names instead of typing each
# one out by hand.
# NOTE(review): myDict and singleDigitsDict are not defined in this snippet;
# they are assumed to be created by the surrounding code.
for i in range(10):
    myDict[30 + i] = "thirty-" + singleDigitsDict[i]
より広範なものが必要な場合は、自然言語処理ツールが必要になります。 この記事 は良い出発点かもしれません。
上記の e_h による C# 実装を、手早く Java に移植したものです。どちらも int ではなく double を返すことに注意してください。
public class Text2Double {

    /**
     * Converts a spelled-out English number such as
     * "three hundred thirty seven" into its numeric value.
     *
     * @param text lowercase number words separated by spaces or hyphens;
     *             the word "and" is accepted and contributes nothing
     * @return the value of the phrase, or 0 when it contains no words
     * @throws IllegalArgumentException if a word is not a known number word
     */
    public double Text2Double(String text) {
        String[] units = new String[]{
                "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
                "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
                "sixteen", "seventeen", "eighteen", "nineteen",
        };
        String[] tens = new String[]{"", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"};
        String[] scales = new String[]{"hundred", "thousand", "million", "billion", "trillion"};

        Map<String, ScaleIncrementPair> numWord = new LinkedHashMap<>();
        numWord.put("and", new ScaleIncrementPair(1, 0));
        for (int i = 0; i < units.length; i++) {
            numWord.put(units[i], new ScaleIncrementPair(1, i));
        }
        // tens[0] and tens[1] are "" placeholders that keep index * 10 equal to
        // the word's value. Registering "" would make every empty token count as 10.
        for (int i = 2; i < tens.length; i++) {
            numWord.put(tens[i], new ScaleIncrementPair(1, i * 10));
        }
        for (int i = 0; i < scales.length; i++) {
            // "hundred" is 10^2; every later scale is 10^(3 * i).
            double scale = (i == 0) ? 100 : Math.pow(10, (i * 3));
            numWord.put(scales[i], new ScaleIncrementPair(scale, 0));
        }

        double current = 0;
        double result = 0;
        for (String word : text.split("[ -]")) {
            if (word.isEmpty()) {
                // Consecutive separators produce empty tokens; they carry no value.
                continue;
            }
            ScaleIncrementPair scaleIncrement = numWord.get(word);
            if (scaleIncrement == null) {
                throw new IllegalArgumentException("Illegal word: " + word);
            }
            current = current * scaleIncrement.scale + scaleIncrement.increment;
            if (scaleIncrement.scale > 100) {
                // A thousand-or-larger scale closes the current group.
                result += current;
                current = 0;
            }
        }
        return result + current;
    }
}
/**
 * Holds the multiplier and the additive value associated with one number word.
 */
public class ScaleIncrementPair {

    /** Factor the running group is multiplied by. */
    public double scale;

    /** Amount added to the running group after scaling. */
    public int increment;

    /**
     * Creates a pair.
     *
     * @param s the scale
     * @param i the increment
     */
    public ScaleIncrementPair(double s, int i) {
        this.scale = s;
        this.increment = i;
    }
}
それを行う Marc Burns 作の Ruby gem があります。私は最近、年(例: "nineteen ninety six")の表記をサポートするためにそれをフォークしました。Python から Ruby コードを呼び出すこともできます。
require 'numbers_in_words'
require 'numbers_in_words/duck_punch'

# Sample phrases to convert.
nums = [
  "fifteen sixteen",
  "eighty five sixteen",
  "nineteen ninety six",
  "one hundred and seventy nine",
  "thirteen hundred",
  "nine thousand two hundred and ninety seven",
]

# Print each phrase followed by its numeric value.
# NOTE(review): in_numbers is presumably added to String by
# numbers_in_words/duck_punch; confirm against the gem's documentation.
nums.each do |phrase|
  p phrase
  p phrase.in_numbers
end
結果:"fifteen sixteen" 1516 "eighty five sixteen" 8516 "nineteen ninety six" 1996 "one hundred and seventy nine" 179 "thirteen hundred" 1300 "nine thousand two hundred and ninety seven" 9297
Text2int(scale)が正しい変換を返すように変更しました。たとえば、text2int( "hundred")=> 100。
import re
# Module-level cache of number words, filled on the first call to text2int().
numwords = {}


def text2int(textnum):
    """Convert a spelled-out English cardinal or ordinal number to an integer.

    A scale word with nothing in front of it counts as one of that scale, so
    "hundred" gives 100.

    Args:
        textnum: Lowercase number words separated by whitespace and/or
            hyphens. Leading and trailing separators are ignored.

    Returns:
        The integer value of the phrase (0 when it contains no words).

    Raises:
        ValueError: If a word is not a known number word.
    """
    if not numwords:
        units = ["zero", "one", "two", "three", "four", "five", "six",
                 "seven", "eight", "nine", "ten", "eleven", "twelve",
                 "thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
                 "eighteen", "nineteen"]
        # The two leading "" entries only pad the list so that index * 10
        # equals the value of the word at that index.
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty",
                "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion",
                  "quadrillion", "quintillion", "sextillion", "septillion",
                  "octillion", "nonillion", "decillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # never register the "" placeholders as a number word
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            numwords[word] = (10 ** (idx * 3 or 2), 0)
        # Earlier revisions misspelled "sextillion"; keep the old spelling
        # working for existing callers.
        numwords["sexillion"] = numwords["sextillion"]

    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    current = result = 0
    for token in re.split(r"[\s-]+", textnum):
        if not token:
            # re.split yields "" when the text starts or ends with a separator.
            continue

        if token in ordinal_words:
            scale, increment = (1, ordinal_words[token])
        else:
            # Reduce a regular ordinal ("sixth", "twentieth") to its cardinal.
            stem = token
            for ending, replacement in ordinal_endings:
                if stem.endswith(ending):
                    stem = stem[:-len(ending)] + replacement
                    break
            if stem not in numwords:
                # Report the word as the caller wrote it, not the stripped stem.
                raise ValueError("Illegal Word: " + token)
            scale, increment = numwords[stem]

        if scale > 1:
            # A bare scale word ("hundred") means one of that scale.
            current = max(1, current)

        current = current * scale + increment
        if scale > 100:
            result += current
            current = 0
    return result + current
簡単な解決策は、 inflect.py を使用して翻訳用の辞書を生成することです。
inflect.py には number_to_words() 関数があり、数値(例: 2)を単語形式(例: 'two')に変換します。残念ながら、その逆変換(これがあれば翻訳辞書を作る手間を省けます)は提供されていません。それでも、この関数を使って翻訳辞書を作成できます。
>>> import inflect
>>> p = inflect.engine()
>>> Word_to_number_mapping = {}
>>>
>>> for i in range(1, 100):
... Word_form = p.number_to_words(i) # 1 -> 'one'
... Word_to_number_mapping[Word_form] = i
...
>>> print Word_to_number_mapping['one']
1
>>> print Word_to_number_mapping['eleven']
11
>>> print Word_to_number_mapping['forty-three']
43
_
もう少し手間をかけてもよければ、inflect.py の number_to_words() 関数の内部動作を調べて、これを動的に行う独自のコードを書くこともできます(私は試していません)。