001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.math3.distribution; 019 020import org.apache.commons.math3.exception.NotStrictlyPositiveException; 021import org.apache.commons.math3.exception.util.LocalizedFormats; 022import org.apache.commons.math3.random.RandomGenerator; 023import org.apache.commons.math3.random.Well19937c; 024import org.apache.commons.math3.util.FastMath; 025 026/** 027 * Implementation of the Zipf distribution. 028 * 029 * @see <a href="http://mathworld.wolfram.com/ZipfDistribution.html">Zipf distribution (MathWorld)</a> 030 */ 031public class ZipfDistribution extends AbstractIntegerDistribution { 032 /** Serializable version identifier. */ 033 private static final long serialVersionUID = -140627372283420404L; 034 /** Number of elements. */ 035 private final int numberOfElements; 036 /** Exponent parameter of the distribution. */ 037 private final double exponent; 038 /** Cached numerical mean */ 039 private double numericalMean = Double.NaN; 040 /** Whether or not the numerical mean has been calculated */ 041 private boolean numericalMeanIsCalculated = false; 042 /** Cached numerical variance */ 043 private double numericalVariance = Double.NaN; 044 /** Whether or not the numerical variance has been calculated */ 045 private boolean numericalVarianceIsCalculated = false; 046 047 /** 048 * Create a new Zipf distribution with the given number of elements and 049 * exponent. 050 * <p> 051 * <b>Note:</b> this constructor will implicitly create an instance of 052 * {@link Well19937c} as random generator to be used for sampling only (see 053 * {@link #sample()} and {@link #sample(int)}). In case no sampling is 054 * needed for the created distribution, it is advised to pass {@code null} 055 * as random generator via the appropriate constructors to avoid the 056 * additional initialisation overhead. 057 * 058 * @param numberOfElements Number of elements. 059 * @param exponent Exponent. 060 * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} 061 * or {@code exponent <= 0}. 062 */ 063 public ZipfDistribution(final int numberOfElements, final double exponent) { 064 this(new Well19937c(), numberOfElements, exponent); 065 } 066 067 /** 068 * Creates a Zipf distribution. 069 * 070 * @param rng Random number generator. 071 * @param numberOfElements Number of elements. 072 * @param exponent Exponent. 073 * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0} 074 * or {@code exponent <= 0}. 075 * @since 3.1 076 */ 077 public ZipfDistribution(RandomGenerator rng, 078 int numberOfElements, 079 double exponent) 080 throws NotStrictlyPositiveException { 081 super(rng); 082 083 if (numberOfElements <= 0) { 084 throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION, 085 numberOfElements); 086 } 087 if (exponent <= 0) { 088 throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT, 089 exponent); 090 } 091 092 this.numberOfElements = numberOfElements; 093 this.exponent = exponent; 094 } 095 096 /** 097 * Get the number of elements (e.g. corpus size) for the distribution. 098 * 099 * @return the number of elements 100 */ 101 public int getNumberOfElements() { 102 return numberOfElements; 103 } 104 105 /** 106 * Get the exponent characterizing the distribution. 107 * 108 * @return the exponent 109 */ 110 public double getExponent() { 111 return exponent; 112 } 113 114 /** {@inheritDoc} */ 115 public double probability(final int x) { 116 if (x <= 0 || x > numberOfElements) { 117 return 0.0; 118 } 119 120 return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent); 121 } 122 123 /** {@inheritDoc} */ 124 @Override 125 public double logProbability(int x) { 126 if (x <= 0 || x > numberOfElements) { 127 return Double.NEGATIVE_INFINITY; 128 } 129 130 return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent)); 131 } 132 133 /** {@inheritDoc} */ 134 public double cumulativeProbability(final int x) { 135 if (x <= 0) { 136 return 0.0; 137 } else if (x >= numberOfElements) { 138 return 1.0; 139 } 140 141 return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent); 142 } 143 144 /** 145 * {@inheritDoc} 146 * 147 * For number of elements {@code N} and exponent {@code s}, the mean is 148 * {@code Hs1 / Hs}, where 149 * <ul> 150 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> 151 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> 152 * </ul> 153 */ 154 public double getNumericalMean() { 155 if (!numericalMeanIsCalculated) { 156 numericalMean = calculateNumericalMean(); 157 numericalMeanIsCalculated = true; 158 } 159 return numericalMean; 160 } 161 162 /** 163 * Used by {@link #getNumericalMean()}. 164 * 165 * @return the mean of this distribution 166 */ 167 protected double calculateNumericalMean() { 168 final int N = getNumberOfElements(); 169 final double s = getExponent(); 170 171 final double Hs1 = generalizedHarmonic(N, s - 1); 172 final double Hs = generalizedHarmonic(N, s); 173 174 return Hs1 / Hs; 175 } 176 177 /** 178 * {@inheritDoc} 179 * 180 * For number of elements {@code N} and exponent {@code s}, the mean is 181 * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where 182 * <ul> 183 * <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li> 184 * <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li> 185 * <li>{@code Hs = generalizedHarmonic(N, s)}.</li> 186 * </ul> 187 */ 188 public double getNumericalVariance() { 189 if (!numericalVarianceIsCalculated) { 190 numericalVariance = calculateNumericalVariance(); 191 numericalVarianceIsCalculated = true; 192 } 193 return numericalVariance; 194 } 195 196 /** 197 * Used by {@link #getNumericalVariance()}. 198 * 199 * @return the variance of this distribution 200 */ 201 protected double calculateNumericalVariance() { 202 final int N = getNumberOfElements(); 203 final double s = getExponent(); 204 205 final double Hs2 = generalizedHarmonic(N, s - 2); 206 final double Hs1 = generalizedHarmonic(N, s - 1); 207 final double Hs = generalizedHarmonic(N, s); 208 209 return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs)); 210 } 211 212 /** 213 * Calculates the Nth generalized harmonic number. See 214 * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic 215 * Series</a>. 216 * 217 * @param n Term in the series to calculate (must be larger than 1) 218 * @param m Exponent (special case {@code m = 1} is the harmonic series). 219 * @return the n<sup>th</sup> generalized harmonic number. 220 */ 221 private double generalizedHarmonic(final int n, final double m) { 222 double value = 0; 223 for (int k = n; k > 0; --k) { 224 value += 1.0 / FastMath.pow(k, m); 225 } 226 return value; 227 } 228 229 /** 230 * {@inheritDoc} 231 * 232 * The lower bound of the support is always 1 no matter the parameters. 233 * 234 * @return lower bound of the support (always 1) 235 */ 236 public int getSupportLowerBound() { 237 return 1; 238 } 239 240 /** 241 * {@inheritDoc} 242 * 243 * The upper bound of the support is the number of elements. 244 * 245 * @return upper bound of the support 246 */ 247 public int getSupportUpperBound() { 248 return getNumberOfElements(); 249 } 250 251 /** 252 * {@inheritDoc} 253 * 254 * The support of this distribution is connected. 255 * 256 * @return {@code true} 257 */ 258 public boolean isSupportConnected() { 259 return true; 260 } 261} 262