annotate data_generation/transformations/pycaptcha/Captcha/Words.py @ 595:da46a62ce402

submitted JMLR pdf
author Yoshua Bengio <bengioy@iro.umontreal.ca>
date Tue, 05 Oct 2010 15:07:33 -0400
parents 1f5937e9e530
children
rev   line source
87
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
1 """ Captcha.Words
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
2
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
3 Utilities for managing word lists and finding random words
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
4 """
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
5 #
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
6 # PyCAPTCHA Package
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
7 # Copyright (C) 2004 Micah Dowty <micah@navi.cx>
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
8 #
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
9
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
10 import random, os
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
11 import File
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
12
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
13
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
class WordList(object):
    """A class representing a word list read from disk lazily.

    Blank lines and comment lines starting with '#' are ignored.
    Any number of words per line may be used. The list can
    optionally ignore words not within a given length range.
    """

    def __init__(self, fileName, minLength=None, maxLength=None):
        """Record the list's file name and optional length limits.

        Nothing is read from disk here. `fileName` is resolved inside the
        "words" directory under `File.dataDir` when `read()` runs.
        `minLength` and `maxLength` are inclusive bounds on word length;
        `None` disables the corresponding check.
        """
        # None means "not loaded yet"; pick() uses it to trigger read().
        self.words = None
        self.fileName = fileName
        self.minLength = minLength
        self.maxLength = maxLength

    def read(self):
        """Read words from disk, replacing any previously loaded list."""
        path = os.path.join(File.dataDir, "words", self.fileName)
        words = []

        # Iterate the file object directly (xreadlines() only exists on
        # Python 2 file objects) and let the context manager close the
        # handle even if reading fails part-way through.
        with open(path) as f:
            for line in f:
                line = line.strip()
                # Skip blank lines and '#' comment lines.
                if not line or line[0] == '#':
                    continue
                for word in line.split():
                    if self.minLength is not None and len(word) < self.minLength:
                        continue
                    if self.maxLength is not None and len(word) > self.maxLength:
                        continue
                    words.append(word)

        # Publish only after a complete read so that a failure above never
        # leaves a partially filled list for pick() to use.
        self.words = words

    def pick(self):
        """Pick a random word from the list, reading it in if necessary.

        Raises IndexError (from random.choice) if no word passed the
        length filter or the file contained no words.
        """
        if self.words is None:
            self.read()
        return random.choice(self.words)
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
49
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
50
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
# Shared module-level word lists. Constructing a WordList does not touch the
# disk; each list is loaded from its file the first time pick() is called.
characters = WordList(fileName="characters")
basic_english = WordList(fileName="basic-english")
basic_english_restricted = WordList(
    fileName="basic-english",
    minLength=5,
    maxLength=8,
)

# The list used when a caller does not ask for a specific one.
defaultWordList = characters
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
56
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
57
4775b4195b4b code pour la generation de captchas
goldfinger
parents:
diff changeset
58 ### The End ###