wiki:Python

Context Navigation

Version 16 (modified by atzm, 17 years ago) (diff)
--

Python

Python

メモ．主に日記からの転載．

行番号を得る

Perl で言うところの __LINE__ みたいなもの．スタックフレームを参照すれば良い．

import inspect

def lineno():
    """Return the line number from which this function was called.

    Looks one level up the call stack, so the value is the caller's
    current line, not a line inside this function.
    """
    caller_frame = inspect.currentframe().f_back
    return caller_frame.f_lineno

if __name__ == '__main__':
    # lineno() reports its caller's line, so this prints the line number
    # of the print statement itself.
    print lineno()

可変長引数

>>> def hoge(*a):
...     print a
>>> hoge(1, 2, 3)
(1, 2, 3)

>>> def hage(a, b, c):
...     print a, b, c
>>> a = (1, 2, 3)
>>> hage(*a)
1 2 3

>>> def hige(**a):
...     print a
>>> hige(a=1, b=2, c=3)
{'a': 1, 'c': 3, 'b': 2}

>>> def huge(a, b, c):
...     print a, b, c
>>> a = {'a': 1, 'b': 2, 'c': 3}
>>> huge(**a)
1 2 3

hidden read-ahead buffer

Built-in Types -- Python v2.6.1 documentation (#file.next) より

In order to make a for loop the most efficient way of looping
over the lines of a file (a very common operation),
the next() method uses a hidden read-ahead buffer.

next() を見る

コード

def printrss():
    pf = open('/proc/self/status')
    for l in pf:
        if l[:5] == 'VmRSS':
            print l,
    pf.close()

def test(fname='/path/to/huge_file'):
    f = open(fname)
    c = 1
    for i in f:
        print '===== %d =====' % c
        printrss()
        c += 1
    f.close()

# Run the for-loop measurement on test()'s default file path.
print 'test1'
test()

結果

  :     :     :
===== 1 =====
VmRSS:      2392 kB
  :     :     :
===== 11212 =====
VmRSS:      2408 kB

ちょっと (16KB) だけ増えてる

readline() を見る

コード

def printrss():
    pf = open('/proc/self/status')
    for l in pf:
        if l[:5] == 'VmRSS':
            print l,
    pf.close()

def test(fname='/path/to/huge_file'):
    f = open(fname)
    c = 1
    while True:
        i = f.readline()
        if not i: break
        print '===== %d =====' % c
        printrss()
        c += 1
    f.close()

# Run the readline() measurement on test()'s default file path.
print 'test1'
test()

結果

  :     :     :
===== 1 =====
VmRSS:      2396 kB
  :     :     :
===== 11212 =====
VmRSS:      2396 kB

増えてない

日本語関連

文字コードとか

例えば euc-jp の場合

>>> a = 'あ'
>>> ord(a[0]) == 0xa4
True
>>> ord(a[1]) == 0xa2
True

>>> ''.join([hex(ord(c))[2:] for c in 'あいうえお'])
'a4a2a4a4a4a6a4a8a4aa'

こんなこともできる

>>> 'あいうえお'.encode('hex')
'a4a2a4a4a4a6a4a8a4aa'

詳細は pydoc encodings

行列の行と列を入れ替えるワンライナー

$ cat hoge.txt 
1 2 3 4
5 6 7 8
9 10 11 12

$ python -c 'import sys; print "\n".join([" ".join(i) for i in zip(*[i.split() for i in sys.stdin])])' < hoge.txt
1 5 9
2 6 10
3 7 11
4 8 12

キモは zip(*list)

staticmethod

デコレータ関数なので以下のように使う

>>> class Hoge:
...     @staticmethod
...     def hoge(*args):
...             print ', '.join([str(i) for i in args])
... 
>>> Hoge.hoge(1, '2', 3.3, None, False, Hoge)
1, 2, 3.3, None, False, __main__.Hoge

別に以下のようにしても問題はない

>>> class Hoge:
...     def hoge(*args):
...         print ', '.join([str(i) for i in args])
...     hoge = staticmethod(hoge)
...
>>> Hoge.hoge(1, '2', 3.3, None, False, Hoge)
1, 2, 3.3, None, False, __main__.Hoge

raw_input 中に SIGALRM 出すと EOFError

import signal
# Install a do-nothing handler for SIGALRM; it accepts and ignores whatever
# arguments the signal machinery passes in.
signal.signal(signal.SIGALRM, lambda *a: None)
# Request a SIGALRM in 5 seconds.
signal.alarm(5)
# Blocks waiting for input; once the alarm fires during the read,
# raw_input() fails with EOFError.
raw_input()

5 秒待つと EOFError．
何とか回避できんもんかなぁ．

HostIP を使う

import urllib
import mimetools

class HostIP(dict):
    """Dictionary of hostip.info lookup results for one IP address.

    The response to the lookup request is parsed as a block of
    ``Name: value`` header lines, and each parsed field becomes a dictionary
    entry. Three extra keys are added:

    ipaddr     -- the address that was looked up
    url        -- the request URL that was fetched
    googlemaps -- a Google Maps URL built from the latitude and longitude

    Raises KeyError when the response has no ``latitude`` or ``longitude``
    field.
    """

    # Lookup endpoint; %s is replaced with the IP address.
    _URL_BASE = 'http://api.hostip.info/rough.php?position=true&ip=%s'
    # Map link; filled with (latitude, longitude, label).
    _GOOGLEMAPS_BASE = 'http://maps.google.com/?q=%sN+%sE(%s)'

    def __init__(self, ipaddr):
        """Fetch and store the lookup result for *ipaddr*."""
        url = self._URL_BASE % ipaddr

        fp = urllib.urlopen(url)
        try:
            # Parse the response as header lines; 0 marks the stream as
            # non-seekable.
            headers = mimetools.Message(fp, 0)
        finally:
            # Release the connection even when parsing fails.
            fp.close()

        dict.__init__(self, headers.dict)
        self['ipaddr'] = ipaddr
        self['url'] = url
        self['googlemaps'] = self._GOOGLEMAPS_BASE % (
            self['latitude'], self['longitude'], ipaddr)

dict として使える HostIP オブジェクト．

>>> test = HostIP('210.156.41.55')
>>> for k, v in test.items():
...     print '%s: %s' % (k, v)
... 
city: Morioka
guessed: true
url: http://api.hostip.info/rough.php?position=true&ip=210.156.41.55
googlemaps: http://maps.google.com/?q=39.7N+141.15E(210.156.41.55)
latitude: 39.7
country: JAPAN
ipaddr: 210.156.41.55
country code: JP
longitude: 141.15

dict 同士のマージ

dict.update() だと同じキーを持つ値が上書きされてしまうので，上書きせずに足し合わせたりできる関数が欲しかった．
ただし，第一引数について破壊的，末端の value が数値以外に色々混ざってる時にどうなるかは知らん，という欠点あり．

def merge(my_dict, new_dict, mergetype='add'):
    """Merge *new_dict* into *my_dict* in place, combining clashing values.

    my_dict   -- dictionary that receives the result (it is modified).
    new_dict  -- dictionary whose entries are merged in (left untouched).
    mergetype -- operator name without the underscores ('add', 'sub',
                 'mul', ...); clashing values are combined with the
                 ``__<mergetype>__`` operation.

    Nested dictionaries are merged recursively. A key missing from *my_dict*
    takes the value from *new_dict*; a nested dictionary is copied as a
    plain dict so that later merges into *my_dict* cannot alter *new_dict*.

    Returns *my_dict*.
    """
    import operator  # local import keeps the snippet self-contained

    method_name = '__%s__' % mergetype
    # Prefer the operator-module function: unlike calling the dunder method
    # directly it applies reflected operands, so 1 "add" 2.5 gives 3.5
    # instead of NotImplemented.
    combine = getattr(operator, method_name, None)

    for key, value in new_dict.items():
        if key not in my_dict:
            if isinstance(value, dict):
                # Copy instead of aliasing the caller's nested dictionary.
                my_dict[key] = merge({}, value, mergetype)
            else:
                my_dict[key] = value
        elif isinstance(value, dict):
            merge(my_dict[key], value, mergetype)
        elif combine is not None:
            my_dict[key] = combine(my_dict[key], value)
        else:
            # No operator-module equivalent: call the method on the value.
            my_dict[key] = getattr(my_dict[key], method_name)(value)
    return my_dict

リストから重複を取り除く

ただし元の順番は保持されない（set を経由するため，結果の並びは不定）．

# Use the built-in set types when the interpreter provides them; otherwise
# (NameError on the bare name lookup) fall back to the ``sets`` module.
try:
    set, frozenset
except NameError:
    from sets import Set as set, ImmutableSet as frozenset

def uniq(sequence):
    """Return a new list holding each distinct element of *sequence* once.

    The elements pass through a set, so they must be hashable and the
    order of the result is unspecified.
    """
    distinct = set(sequence)
    return list(distinct)

Python でマルコフ連鎖

参考： MeCabとPythonでマルコフ連鎖を書いてみる(改)

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import random
import MeCab
import feedparser

def wakati(text):
    """Split *text* into a list of words using MeCab's ``-Owakati`` mode.

    The tagger output is stripped of trailing spaces and newlines and then
    split on single spaces.
    """
    tagger = MeCab.Tagger("-Owakati")
    segmented = tagger.parse(text)
    return segmented.rstrip(" \n").split(" ")

def create_markov_table(wordlist, level=2):
    """Build a Markov transition table from a sequence of words.

    wordlist -- iterable of words in their original order.
    level    -- number of preceding words that make up each key.

    Returns a dict mapping each tuple of *level* consecutive words to the
    list of words seen right after it (duplicates kept, in order of
    appearance).
    """
    table = {}
    # Sliding window over the last `level` words; it starts out filled with
    # empty strings, which mark it as "not full yet".
    window = ["" for _ in range(level)]

    for word in wordlist:
        # Record a transition only once every slot holds a non-empty word.
        if all(window):
            table.setdefault(tuple(window), []).append(word)
        window = window[1:] + [word]

    return table

def gen_sentence(markov, level=2, max_words=None):
    """Generate a random sentence by walking a Markov transition table.

    markov    -- dict mapping tuples of words to lists of successor words,
                 as built by create_markov_table().
    level     -- length of the key tuples; used only to derive the default
                 word limit.
    max_words -- upper bound on the number of generated words. When None it
                 is the number of recorded successors plus *level*, which
                 for a table built by create_markov_table() from a word
                 list without empty tokens equals the length of that list.

    Returns the generated words joined without separators, or an empty
    string when *markov* is empty.
    """
    if not markov:
        return ''

    if max_words is None:
        # Derive the bound from the table instead of a module-level global.
        max_words = sum(len(successors)
                        for successors in markov.values()) + level

    sentence = []
    state = random.choice(list(markov))
    for _ in range(max_words):
        successors = markov.get(state)
        if not successors:
            # Dead end: this state never changes again, so stop here.
            break
        word = random.choice(successors)
        sentence.append(word)
        # Slide the window: drop the oldest word, append the new one.
        state = tuple(state)[1:] + (word,)

    return ''.join(sentence)

def cnet_news():
    """Fetch the CNET Japan RSS feed and return its summaries as UTF-8 text.

    Only the first line of each entry's summary is kept; those lines are
    joined with newlines and encoded to UTF-8.
    """
    feed = feedparser.parse("http://feed.japan.cnet.com/rss/index.rdf")
    first_lines = []
    for entry in feed.entries:
        first_lines.append(entry["summary"].split("\n")[0])
    joined = "\n".join(first_lines)
    return joined.encode("utf-8")

if __name__ == "__main__":
    # Number of preceding words used as the key of each Markov state.
    level = 4
    # Fetch the source text (UTF-8 encoded) and split it into words.
    src = cnet_news()
    wordlist = wakati(src)
    # Build the transition table, then walk it to produce a sentence.
    markov = create_markov_table(wordlist, level)
    sentence = gen_sentence(markov, level)
    # Decode the UTF-8 result to a unicode object before printing it.
    print unicode(sentence, "utf-8")

Download in other formats:

Plain Text