author     Tapio Lehtonen <tale@debian.org>  2009-09-05 21:48:04 +0000
committer  Tapio Lehtonen <tale@debian.org>  2009-09-05 21:48:04 +0000
commit     833b08257c76ed1eeffa2d14f5053564a0e2e346 (patch)
tree       f86b9b1860df92af575fc9841f9695a1cd48ae2c /po
parent     8e933eec1826c44745a9a6f7fd0dab6815374c24 (diff)
download   installation-guide-833b08257c76ed1eeffa2d14f5053564a0e2e346.zip
This is getting complicated
Diffstat (limited to 'po')
-rw-r--r--  po/fi/fi.py       173
-rwxr-xr-x  po/fi/test-fi.py   16
2 files changed, 189 insertions(+), 0 deletions(-)
diff --git a/po/fi/fi.py b/po/fi/fi.py
new file mode 100644
index 000000000..2fe2543dd
--- /dev/null
+++ b/po/fi/fi.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+# pyenchant
+#
+# Copyright (C) 2004-2005, Ryan Kelly
+# 2009, Tapio Lehtonen
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the
+# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+# Boston, MA 02111-1307, USA.
+#
+# In addition, as a special exception, you are
+# given permission to link the code of this program with
+# non-LGPL Spelling Provider libraries (eg: a MSFT Office
+# spell checker backend) and distribute linked combinations including
+# the two. You must obey the GNU Lesser General Public License in all
+# respects for all of the code used other than said providers. If you modify
+# this file, you may extend this exception to your version of the
+# file, but you are not obligated to do so. If you do not wish to
+# do so, delete this exception statement from your version.
+#
+"""
+
+ enchant.tokenize.en: Tokeniser for the Finnish language
+
+ This module implements a PyEnchant text tokenizer for the Finnish
+ language, based on very simple rules.
+
+"""
+
+import unittest
+import unicodedata
+import locale
+
+import enchant.tokenize
+
+class tokenize(enchant.tokenize.tokenize):
+ """Iterator splitting text into words, reporting position.
+
+ This iterator takes a text string as input, and yields tuples
+ representing each distinct word found in the text. The tuples
+ take the form:
+
+ (<word>,<pos>)
+
+ Where <word> is the word string found and <pos> is the position
+ of the start of the word within the text.
+
+ The optional argument <valid_chars> may be used to specify a
+ list of additional characters that can form part of a word.
+ By default, this list contains only the apostrophe ('). Note that
+ these characters cannot appear at the start or end of a word.
+ """
+
+    def __init__(self,text,valid_chars=(u"'",)):
+        #enchant.tokenize.tokenize.__init__(self)
+        # Save the current locale string; locale.getlocale() does not
+        # accept LC_ALL, so query it via setlocale() with no new value.
+        self.loc = locale.setlocale(locale.LC_ALL)
+        locale.setlocale(locale.LC_ALL, (u"fi_FI", u"UTF-8")) # Finnish locale
+        self._valid_chars = valid_chars
+        self._text = text
+        if isinstance(text,unicode):
+            self._myIsAlpha = self._myIsAlpha_u
+        else:
+            self._myIsAlpha = self._myIsAlpha_a
+        self.offset = 0
+
+    #def __del__(self):
+    #    locale.setlocale(locale.LC_ALL, self.loc)
+    #    enchant.tokenize.tokenize.__del__(self)
+
+    def _myIsAlpha_a(self,c):
+        """Is-alpha test for byte strings: letters plus valid_chars."""
+        if c.isalpha() or c in self._valid_chars:
+            return True
+        return False
+
+    def _myIsAlpha_u(self,c):
+        """Extra is-alpha tests for unicode characters.
+        As well as letter characters, treat combining marks as letters.
+        """
+        if c.isalpha():
+            return True
+        if c in self._valid_chars:
+            return True
+        if unicodedata.category(c)[0] == u"M":
+            return True
+        return False
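+    # Note (illustration): u"\u0308" (COMBINING DIAERESIS) has Unicode
+    # category "Mn"; it is not alpha on its own, but accepting it keeps a
+    # decomposed u"i\u0308" together as one token (see test_unicodeCombining).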
+
+    def next(self):
+        text = self._text
+        offset = self.offset
+        while True:
+            if offset >= len(text):
+                break
+            # Find start of next word (must be alpha)
+            while offset < len(text) and not text[offset].isalpha():
+                offset += 1
+            curPos = offset
+            # Find end of word using myIsAlpha
+            while offset < len(text) and self._myIsAlpha(text[offset]):
+                offset += 1
+            # Return the word if it isn't empty
+            if curPos != offset:
+                # Make sure word ends with an alpha
+                #while not text[offset-1].isalpha():
+                #    offset = offset - 1
+                self.offset = offset
+                return (text[curPos:offset],curPos)
+        self.offset = offset
+        raise StopIteration()
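+    # Scan trace (illustration): for u"on 2 nen" the first call returns
+    # (u"on", 0); the second skips " 2 " as non-alpha and returns (u"nen", 5).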
+
+
+class TestTokenizeFI(unittest.TestCase):
+    """Test cases for checking the behavior of Finnish tokenization."""
+
+    def test_tokenize_fi(self):
+        """Simple regression test for Finnish tokenization."""
+        inputT = u"""Tämä on kappale. Eipä ole kovin 2 nen, mutta tarkoitus on näyttää miten sanastaja
+toimii useiden-erilaisten sanaryppäiden kimpussa.
+Pitääpä vielä "tarkistaa" sanat jotka 'lainausmerkeissä."""
+        outputT = [
+            (u"Tämä",0),(u"on",5),(u"kappale",8),(u"Eipä",17),(u"ole",22),
+            (u"kovin",26),(u"nen",34),(u"mutta",39),(u"tarkoitus",45),
+            (u"on",55),(u"näyttää",58),(u"miten",66),(u"sanastaja",72),
+            (u"toimii",83),(u"useiden",90),(u"erilaisten",98),(u"sanaryppäiden",109),
+            (u"kimpussa",123),(u"Pitääpä",133),(u"vielä",141),(u"tarkistaa",148),
+            (u"sanat",159),(u"jotka", 165),(u"lainausmerkeissä",172)
+            ]
+        for (itmO,itmV) in zip(outputT,tokenize(inputT)):
+            self.assertEqual(itmO,itmV)
+
+    def test_bug1591450(self):
+        """Check for tokenization regressions identified in bug #1591450."""
+        inputT = """Testing <i>markup</i> and {y:i}so-forth...leading dots and trail--- well, you get-the-point. Also check numbers: 999 1,000 12:00 .45. Done?"""
+        outputT = [
+            ("Testing",0),("i",9),("markup",11),("i",19),("and",22),
+            ("y",27),("i",29),("so",31),("forth",34),("leading",42),
+            ("dots",50),("and",55),("trail",59),("well",68),
+            ("you",74),("get",78),("the",82),("point",86),
+            ("Also",93),("check",98),("numbers",104),("Done",134),
+            ]
+        for (itmO,itmV) in zip(outputT,tokenize(inputT)):
+            self.assertEqual(itmO,itmV)
+
+    def test_unicodeBasic(self):
+        """Test tokenization of a basic unicode string."""
+        inputT = u"Ik ben ge\u00EFnteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet \u00E9\u00E9n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao"
+        outputT = inputT.split(" ")
+        outputT[8] = outputT[8][0:-1]
+        for (itmO,itmV) in zip(outputT,tokenize(inputT)):
+            self.assertEqual(itmO,itmV[0])
+            self.assert_(inputT[itmV[1]:].startswith(itmO))
+
+    def test_unicodeCombining(self):
+        """Test tokenization with unicode combining symbols."""
+        inputT = u"Ik ben gei\u0308nteresseerd in de co\u00F6rdinatie van mijn knie\u00EBn, maar kan niet e\u0301e\u0301n \u00E0 twee enqu\u00EAtes vinden die recht doet aan mijn carri\u00E8re op Cura\u00E7ao"
+        outputT = inputT.split(" ")
+        outputT[8] = outputT[8][0:-1]
+        for (itmO,itmV) in zip(outputT,tokenize(inputT)):
+            self.assertEqual(itmO,itmV[0])
+            self.assert_(inputT[itmV[1]:].startswith(itmO))
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/po/fi/test-fi.py b/po/fi/test-fi.py
new file mode 100755
index 000000000..7de902906
--- /dev/null
+++ b/po/fi/test-fi.py
@@ -0,0 +1,16 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import fi
+import unittest
+
+tzer = fi.tokenize(u"""Tämä on kappale. Eipä ole kovin 2 nen, mutta tarkoitus on näyttää miten sanastaja
+toimii useiden-erilaisten sanaryppäiden kimpussa.
+Pitääpä vielä "tarkistaa" sanat jotka 'lainausmerkeissä.""",
+                   valid_chars=u"'")
+for w in tzer:
+    print w
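+# Each printed line is a (word, offset) tuple, e.g. (u'T\xe4m\xe4', 0).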
+
+#koe = fi.TestTokenizeFI()
+#koe.test_tokenize_fi()
+
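For context, a minimal sketch of how this tokenizer could feed a pyenchant dictionary (an illustration only, not part of this commit; it assumes pyenchant and a fi_FI dictionary are installed):

# -*- coding: utf-8 -*-
# Hedged sketch: print the words a fi_FI dictionary does not recognize.
import enchant
import fi

d = enchant.Dict("fi_FI")  # assumption: a Finnish dictionary is installed
text = u"Tämä on kappale."
for word, pos in fi.tokenize(text):
    if not d.check(word):
        print u"%s at offset %d" % (word, pos)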