author     Tapio Lehtonen <tale@debian.org>    2009-09-10 05:13:23 +0000
committer  Tapio Lehtonen <tale@debian.org>    2009-09-10 05:13:23 +0000
commit     3d7d1cbaa6e50c96a7a45ce1a6e4dfe86fbb5379 (patch)
tree       81d50164fd72151b6f443c8274a7b6b264a18ff2
parent     5deb5b41aa86a0fdbf9969fb0890b967834362b3 (diff)
download   installation-guide-3d7d1cbaa6e50c96a7a45ce1a6e4dfe86fbb5379.zip
Tokenizer is OK as it is, no need to modify it.
-rw-r--r--  po/fi/fi.py  186
1 file changed, 0 insertions, 186 deletions
diff --git a/po/fi/fi.py b/po/fi/fi.py
deleted file mode 100644
index 0daf3b58b..000000000
--- a/po/fi/fi.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# -*- coding: utf-8 -*-
-# pyenchant
-#
-# Copyright (C) 2004-2005, Ryan Kelly
-# 2009, Tapio Lehtonen
-#
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2.1 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the
-# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-# Boston, MA 02111-1307, USA.
-#
-# In addition, as a special exception, you are
-# given permission to link the code of this program with
-# non-LGPL Spelling Provider libraries (eg: a MSFT Office
-# spell checker backend) and distribute linked combinations including
-# the two. You must obey the GNU Lesser General Public License in all
-# respects for all of the code used other than said providers. If you modify
-# this file, you may extend this exception to your version of the
-# file, but you are not obligated to do so. If you do not wish to
-# do so, delete this exception statement from your version.
-#
-"""
-
- enchant.tokenize.fi: Tokeniser for the Finnish language
-
- This module implements a PyEnchant text tokenizer for the Finnish
- language, based on very simple rules.
-
-"""
-
-import unittest
-import unicodedata
-import locale
-
-import enchant.tokenize
-
-class tokenize(enchant.tokenize.tokenize):
- """Iterator splitting text into words, reporting position.
-
- This iterator takes a text string as input, and yields tuples
- representing each distinct word found in the text. The tuples
- take the form:
-
- (<word>,<pos>)
-
- Where <word> is the word string found and <pos> is the position
- of the start of the word within the text.
-
- The optional argument <valid_chars> may be used to specify a
- list of additional characters that can form part of a word.
- By default, this list contains only the apostrophe ('). Note that
- these characters cannot appear at the start or end of a word.
- """
-
-    def __init__(self,text,valid_chars=(u"'",)):
-        #enchant.tokenize.tokenize.__init__(self)
-        #LC_ALL not allowed as category for getlocale
-        self.savedloc = locale.getlocale(locale.LC_CTYPE) # Save current locale
-        # Python Library Reference version 2.6 chapter 23.2.1 states
-        # the locale should not be changed, but I cannot see how
-        # isalpha works for Finnish without using locale fi_FI.
-        # Either change the locale or write my own isalpha function
-        # for this class here. Adding only åäöÅÄÖ to valid_chars
-        # is not enough with Unicode; the tokenizer would then barf
-        # on words that are admittedly not Finnish but valid
-        # nonetheless, for example: süsses, spaß.
-        locale.setlocale(locale.LC_ALL, (u"fi_FI", u"UTF-8")) # Finnish locale
-        self._valid_chars = valid_chars
-        self._text = text
-        if isinstance(text,unicode):
-            self._myIsAlpha = self._myIsAlpha_u
-        else:
-            self._myIsAlpha = self._myIsAlpha_a
-        self.offset = 0
-
-    def __del__(self):
-        locale.setlocale(locale.LC_ALL, self.savedloc)
-        # enchant.tokenize.tokenize.__del__(self)
-
-    def _myIsAlpha_a(self,c):
-        if c.isalpha() or c in self._valid_chars:
-            return True
-        return False
-
-    def _myIsAlpha_u(self,c):
-        """Extra is-alpha tests for unicode characters.
-        As well as letter characters, treat combining marks as letters.
-        """
-        if c.isalpha():
-            return True
-        if c in self._valid_chars:
-            return True
-        if unicodedata.category(c)[0] == u"M":
-            return True
-        return False
-
-    def next(self):
-        text = self._text
-        offset = self.offset
-        while True:
-            if offset >= len(text):
-                break
-            # Find start of next word (must be alpha)
-            while offset < len(text) and not text[offset].isalpha():
-                offset += 1
-            curPos = offset
-            # Find end of word using myIsAlpha
-            while offset < len(text) and self._myIsAlpha(text[offset]):
-                offset += 1
-            # Return if word isn't empty
-            if(curPos != offset):
-                # Make sure word ends with an alpha
-                #while not text[offset-1].isalpha():
-                #    offset = offset - 1
-                self.offset = offset
-                return (text[curPos:offset],curPos)
-        self.offset = offset
-        raise StopIteration()
-
-
-class TestTokenizeFI(unittest.TestCase):
-    """TestCases for checking behavior of Finnish tokenization."""
-
-    def test_tokenize_fi(self):
-        """Simple regression test for Finnish tokenization."""
-        inputT = u"""Tämä on kappale. Eipä ole kovin 2 nen, mutta tarkoitus on näyttää miten sanastaja
-toimii useiden-erilaisten sanaryppäiden kimpussa.
-Pitääpä vielä "tarkistaa" sanat jotka "lainausmerkeissä". Heittomerkki ja vaa'an.
-Ulkomaisia sanoja süss, spaß."""
-        outputT = [
-            (u"Tämä",0),(u"on",5),(u"kappale",8),(u"Eipä",17),(u"ole",22),
-            (u"kovin",26),(u"nen",34),(u"mutta",39),(u"tarkoitus",45),
-            (u"on",55),(u"näyttää",58),(u"miten",66),(u"sanastaja",72),
-            (u"toimii",83),(u"useiden",90),(u"erilaisten",98),(u"sanaryppäiden",109),
-            (u"kimpussa",123),(u"Pitääpä",133),(u"vielä",141),(u"tarkistaa",148),
-            (u"sanat",159),(u"jotka", 165),(u"lainausmerkeissä",172),
-            (u"Heittomerkki", 191), (u"ja", 204), (u"vaa'an", 207),
-            (u"Ulkomaisia", 215), (u"sanoja", 226), (u"süss", 233),
-            (u"spaß", 239)
-            ]
-        for (itmO,itmV) in zip(outputT,tokenize(inputT)):
-            self.assertEqual(itmO,itmV)
-
-    def test_bug1591450(self):
-        """Check for tokenization regressions identified in bug #1591450."""
-        inputT = """Testing <i>markup</i> and {y:i}so-forth...leading dots and trail--- well, you get-the-point. Also check numbers: 999 1,000 12:00 .45. Done?"""
-        outputT = [
-            ("Testing",0),("i",9),("markup",11),("i",19),("and",22),
-            ("y",27),("i",29),("so",31),("forth",34),("leading",42),
-            ("dots",50),("and",55),("trail",59),("well",68),
-            ("you",74),("get",78),("the",82),("point",86),
-            ("Also",93),("check",98),("numbers",104),("Done",134),
-            ]
-        for (itmO,itmV) in zip(outputT,tokenize(inputT)):
-            self.assertEqual(itmO,itmV)
-
-    def test_unicodeBasic(self):
-        """Test tokenization of a basic unicode string."""
-        inputT = u"Ik ben ge\u00EFnteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet \u00E9\u00E9n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao"
-        outputT = inputT.split(" ")
-        outputT[8] = outputT[8][0:-1]
-        for (itmO,itmV) in zip(outputT,tokenize(inputT)):
-            self.assertEqual(itmO,itmV[0])
-            self.assert_(inputT[itmV[1]:].startswith(itmO))
-
-    def test_unicodeCombining(self):
-        """Test tokenization with unicode combining symbols."""
-        inputT = u"Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao"
-        outputT = inputT.split(" ")
-        outputT[8] = outputT[8][0:-1]
-        for (itmO,itmV) in zip(outputT,tokenize(inputT)):
-            self.assertEqual(itmO,itmV[0])
-            self.assert_(inputT[itmV[1]:].startswith(itmO))
-
-if __name__ == "__main__":
-    unittest.main()
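
For reference, the deleted module is a drop-in pyenchant tokenizer: you construct the tokenize class with a text string and iterate over it, getting back (word, offset) tuples as described in its docstring. A minimal usage sketch under Python 2 follows; the import path "fi" is assumed purely for illustration, and the fi_FI.UTF-8 locale must be installed, since the constructor switches to it so that isalpha() accepts Finnish letters in byte strings.

    # -*- coding: utf-8 -*-
    # Hypothetical usage sketch (Python 2) of the tokenizer shown in the
    # diff above; the module name "fi" is an assumption, not the real
    # import path.
    from fi import tokenize

    text = u"Pitääpä vielä tarkistaa sanat, esimerkiksi vaa'an."
    for word, pos in tokenize(text):
        # Each item is a (word, offset) pair: the offset is the index of
        # the word's first character in the original string.
        print word.encode("utf-8"), pos

Iteration works because the pyenchant base class supplies __iter__ while this subclass supplies next(), so the object can be consumed directly in a for loop.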