#!/usr/bin/python

## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
##
##      http://www.apache.org/licenses/LICENSE-2.0
##
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.

"""Replicates Rao et al's "proof" that the Indus symbols
represent a writing system. With a pure Zipfian model with alpha=1.5,
and complete conditional independence, you observe a very similar
growth of conditional entropy to what they observe for the Indus
corpus.

Outputs code that can be fed into R to produce the plot.
"""

__author__ = """
rws@xoba.com (Richard Sproat)
"""


from math import log


def MakePerfectZipf(n, alpha=1.0):
  alpha = float(alpha)
  unigrams = []
  total = 0
  for i in range(1, n + 1):
    p = 1.0/(i ** alpha)
    unigrams.append(p)
    total += p
  for i in range(0, n):
    unigrams[i] = unigrams[i]/total
  return unigrams


def MakeIndependentJointMatrix(unigrams):
  joints = {}
  for i in range(len(unigrams)):
    for j in range(len(unigrams)):
      joints[i, j] = unigrams[i] * unigrams[j]
  return joints


def ComputeEntropy(joints, unigrams, n):
  ent = 0
  for x in range(n):
    for y in range(n):
      try:
        ent -= joints[x, y] * log(joints[x, y] / unigrams[x])
      except KeyError:
        pass
  return ent


def ComputeUnigramEntropy(unigrams, n):
  ent = 0
  for x in range(n):
    ent -= unigrams[x] * log(unigrams[x])
  return ent


def SimpleIndependent(n, alpha):
  unigrams = MakePerfectZipf(n, alpha)
  joints = MakeIndependentJointMatrix(unigrams)
  ents = []
  for i in range(20, n, 20):
    ents.append(ComputeEntropy(joints, unigrams, i))
  uents = []
  for i in range(20, n, 20):
    uents.append(ComputeUnigramEntropy(unigrams, i))
  return ents, uents


def SimpleIndependentFromFile(file):
  unigrams = []
  p = open(file)
  for line in p:
    unigrams.append(float(line.strip()))
  n = len(unigrams)
  joints = MakeIndependentJointMatrix(unigrams)
  ents = []
  for i in range(20, n, 20):
    ents.append(ComputeEntropy(joints, unigrams, i))
  uents = []
  for i in range(20, n, 20):
    uents.append(ComputeUnigramEntropy(unigrams, i))
  return ents, uents


def ComputeFromRealData(unifile, bifile, maxcorp=-1):
  unigrams = []
  idx = {}
  p = open(unifile)
  i = 0
  for line in p:
    line = line.strip().split()
    prob = float(line[0])
    uni = line[1]
    unigrams.append(prob)
    idx[uni] = i
    i += 1
  p.close()
  joints = {}
  p = open(bifile)
  for line in p:
    line = line.strip().split()
    prob = float(line[0])
    bi = line[1]
    bi = bi.split(',')
    try: joints[idx[bi[0]], idx[bi[1]]] = prob
    except KeyError: pass
  p.close()
  if maxcorp > 0: unigrams = unigrams[:maxcorp]
  n = len(unigrams)
  ents = []
  for i in range(20, n, 20):
    ents.append(ComputeEntropy(joints, unigrams, i))
  uents = []
  for i in range(20, n, 20):
    uents.append(ComputeUnigramEntropy(unigrams, i))
  return ents, uents


ZIPF_MAIN_ = "Perfect Zipf model, alpha=%4.2f, Complete Independence"

def MakeRPlot(ents, alpha=-1, pch=5):
  main=""
  if alpha != -1: 
    main= ZIPF_MAIN_ = alpha
  steps = []
  for i in range(len(ents)):
    steps.append(i*20 + 20)
  print 'steps = c(%s)' % ','.join(map(lambda x: str(x), steps))
  print 'ents = c(%s)' % ','.join(map(lambda x: str(x), ents))
  print 'plot(steps, ents, xlab="Number of Tokens",\
 ylim=c(0,4), xlim=c(0,400), main="%s",\
 ylab="Conditional Entropy", type="b", pch=%d, col=%d)' % (main, pch, pch)
  print 'par(new=TRUE)'
  #print 'q()'


if __name__ == '__main__':
  alpha = 1.4
  ents, uents = SimpleIndependent(400, alpha)
  MakeRPlot(ents, alpha)
