view data_generation/transformations/pycaptcha/Captcha/Words.py @ 647:47af8a002530 tip

changed Theano to ift6266 and remove numpy as we do not use code from numpy in this repository
author Razvan Pascanu <r.pascanu@gmail.com>
date Wed, 17 Oct 2012 09:26:14 -0400
parents 1f5937e9e530
children
line wrap: on
line source

""" Captcha.Words

Utilities for managing word lists and finding random words
"""
#
# PyCAPTCHA Package
# Copyright (C) 2004 Micah Dowty <micah@navi.cx>
#

import random, os
import File


class WordList(object):
    """A word list that is loaded from disk the first time it is needed.

       The file is looked up as <File.dataDir>/words/<fileName>.
       Blank lines and comment lines starting with '#' are ignored.
       Any number of whitespace-separated words per line may be used.
       The list can optionally ignore words not within a given length
       range; a bound of None disables that side of the check.
       """
    def __init__(self, fileName, minLength=None, maxLength=None):
        """Remember where the list lives and how to filter it.

           No I/O happens here: 'words' stays None until read() runs.
           """
        self.words = None
        self.fileName = fileName
        self.minLength = minLength
        self.maxLength = maxLength

    def read(self):
        """Read words from disk, replacing any previously loaded list.

           Any error raised while opening or reading the file propagates
           to the caller, and self.words is left untouched in that case.
           """
        path = os.path.join(File.dataDir, "words", self.fileName)

        # Collect into a local list and publish it only once the whole
        # file has been parsed, so a failure part-way through cannot leave
        # a truncated list that pick() would mistake for a complete one.
        words = []
        # 'with' closes the file even if reading raises. Iterating the
        # file object directly yields the same lines xreadlines() did.
        with open(path) as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                for word in line.split():
                    if self.minLength is not None and len(word) < self.minLength:
                        continue
                    if self.maxLength is not None and len(word) > self.maxLength:
                        continue
                    words.append(word)
        self.words = words

    def pick(self):
        """Pick a random word from the list, reading it in if necessary.

           Raises IndexError (from random.choice) if the list is empty.
           """
        if self.words is None:
            self.read()
        return random.choice(self.words)


# Module-wide word lists. Creating them here does no file I/O; each one
# loads its file the first time a word is picked from it.
characters = WordList(fileName="characters")
basic_english = WordList(fileName="basic-english")
basic_english_restricted = WordList(
    fileName="basic-english",
    minLength=5,
    maxLength=8,
)

# List used when the caller does not choose one explicitly.
defaultWordList = characters


### The End ###