ZipfDistribution.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      http://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.math3.distribution;

  18. import org.apache.commons.math3.exception.NotStrictlyPositiveException;
  19. import org.apache.commons.math3.exception.util.LocalizedFormats;
  20. import org.apache.commons.math3.random.RandomGenerator;
  21. import org.apache.commons.math3.random.Well19937c;
  22. import org.apache.commons.math3.util.FastMath;

  23. /**
  24.  * Implementation of the Zipf distribution.
  25.  *
  26.  * @see <a href="http://mathworld.wolfram.com/ZipfDistribution.html">Zipf distribution (MathWorld)</a>
  27.  */
  28. public class ZipfDistribution extends AbstractIntegerDistribution {
  29.     /** Serializable version identifier. */
  30.     private static final long serialVersionUID = -140627372283420404L;
  31.     /** Number of elements. */
  32.     private final int numberOfElements;
  33.     /** Exponent parameter of the distribution. */
  34.     private final double exponent;
  35.     /** Cached numerical mean */
  36.     private double numericalMean = Double.NaN;
  37.     /** Whether or not the numerical mean has been calculated */
  38.     private boolean numericalMeanIsCalculated = false;
  39.     /** Cached numerical variance */
  40.     private double numericalVariance = Double.NaN;
  41.     /** Whether or not the numerical variance has been calculated */
  42.     private boolean numericalVarianceIsCalculated = false;

  43.     /**
  44.      * Create a new Zipf distribution with the given number of elements and
  45.      * exponent.
  46.      * <p>
  47.      * <b>Note:</b> this constructor will implicitly create an instance of
  48.      * {@link Well19937c} as random generator to be used for sampling only (see
  49.      * {@link #sample()} and {@link #sample(int)}). In case no sampling is
  50.      * needed for the created distribution, it is advised to pass {@code null}
  51.      * as random generator via the appropriate constructors to avoid the
  52.      * additional initialisation overhead.
  53.      *
  54.      * @param numberOfElements Number of elements.
  55.      * @param exponent Exponent.
  56.      * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
  57.      * or {@code exponent <= 0}.
  58.      */
  59.     public ZipfDistribution(final int numberOfElements, final double exponent) {
  60.         this(new Well19937c(), numberOfElements, exponent);
  61.     }

  62.     /**
  63.      * Creates a Zipf distribution.
  64.      *
  65.      * @param rng Random number generator.
  66.      * @param numberOfElements Number of elements.
  67.      * @param exponent Exponent.
  68.      * @exception NotStrictlyPositiveException if {@code numberOfElements <= 0}
  69.      * or {@code exponent <= 0}.
  70.      * @since 3.1
  71.      */
  72.     public ZipfDistribution(RandomGenerator rng,
  73.                             int numberOfElements,
  74.                             double exponent)
  75.         throws NotStrictlyPositiveException {
  76.         super(rng);

  77.         if (numberOfElements <= 0) {
  78.             throw new NotStrictlyPositiveException(LocalizedFormats.DIMENSION,
  79.                                                    numberOfElements);
  80.         }
  81.         if (exponent <= 0) {
  82.             throw new NotStrictlyPositiveException(LocalizedFormats.EXPONENT,
  83.                                                    exponent);
  84.         }

  85.         this.numberOfElements = numberOfElements;
  86.         this.exponent = exponent;
  87.     }

  88.     /**
  89.      * Get the number of elements (e.g. corpus size) for the distribution.
  90.      *
  91.      * @return the number of elements
  92.      */
  93.     public int getNumberOfElements() {
  94.         return numberOfElements;
  95.     }

  96.     /**
  97.      * Get the exponent characterizing the distribution.
  98.      *
  99.      * @return the exponent
  100.      */
  101.     public double getExponent() {
  102.         return exponent;
  103.     }

  104.     /** {@inheritDoc} */
  105.     public double probability(final int x) {
  106.         if (x <= 0 || x > numberOfElements) {
  107.             return 0.0;
  108.         }

  109.         return (1.0 / FastMath.pow(x, exponent)) / generalizedHarmonic(numberOfElements, exponent);
  110.     }

  111.     /** {@inheritDoc} */
  112.     @Override
  113.     public double logProbability(int x) {
  114.         if (x <= 0 || x > numberOfElements) {
  115.             return Double.NEGATIVE_INFINITY;
  116.         }

  117.         return -FastMath.log(x) * exponent - FastMath.log(generalizedHarmonic(numberOfElements, exponent));
  118.     }

  119.     /** {@inheritDoc} */
  120.     public double cumulativeProbability(final int x) {
  121.         if (x <= 0) {
  122.             return 0.0;
  123.         } else if (x >= numberOfElements) {
  124.             return 1.0;
  125.         }

  126.         return generalizedHarmonic(x, exponent) / generalizedHarmonic(numberOfElements, exponent);
  127.     }

  128.     /**
  129.      * {@inheritDoc}
  130.      *
  131.      * For number of elements {@code N} and exponent {@code s}, the mean is
  132.      * {@code Hs1 / Hs}, where
  133.      * <ul>
  134.      *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
  135.      *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
  136.      * </ul>
  137.      */
  138.     public double getNumericalMean() {
  139.         if (!numericalMeanIsCalculated) {
  140.             numericalMean = calculateNumericalMean();
  141.             numericalMeanIsCalculated = true;
  142.         }
  143.         return numericalMean;
  144.     }

  145.     /**
  146.      * Used by {@link #getNumericalMean()}.
  147.      *
  148.      * @return the mean of this distribution
  149.      */
  150.     protected double calculateNumericalMean() {
  151.         final int N = getNumberOfElements();
  152.         final double s = getExponent();

  153.         final double Hs1 = generalizedHarmonic(N, s - 1);
  154.         final double Hs = generalizedHarmonic(N, s);

  155.         return Hs1 / Hs;
  156.     }

  157.     /**
  158.      * {@inheritDoc}
  159.      *
  160.      * For number of elements {@code N} and exponent {@code s}, the mean is
  161.      * {@code (Hs2 / Hs) - (Hs1^2 / Hs^2)}, where
  162.      * <ul>
  163.      *  <li>{@code Hs2 = generalizedHarmonic(N, s - 2)},</li>
  164.      *  <li>{@code Hs1 = generalizedHarmonic(N, s - 1)},</li>
  165.      *  <li>{@code Hs = generalizedHarmonic(N, s)}.</li>
  166.      * </ul>
  167.      */
  168.     public double getNumericalVariance() {
  169.         if (!numericalVarianceIsCalculated) {
  170.             numericalVariance = calculateNumericalVariance();
  171.             numericalVarianceIsCalculated = true;
  172.         }
  173.         return numericalVariance;
  174.     }

  175.     /**
  176.      * Used by {@link #getNumericalVariance()}.
  177.      *
  178.      * @return the variance of this distribution
  179.      */
  180.     protected double calculateNumericalVariance() {
  181.         final int N = getNumberOfElements();
  182.         final double s = getExponent();

  183.         final double Hs2 = generalizedHarmonic(N, s - 2);
  184.         final double Hs1 = generalizedHarmonic(N, s - 1);
  185.         final double Hs = generalizedHarmonic(N, s);

  186.         return (Hs2 / Hs) - ((Hs1 * Hs1) / (Hs * Hs));
  187.     }

  188.     /**
  189.      * Calculates the Nth generalized harmonic number. See
  190.      * <a href="http://mathworld.wolfram.com/HarmonicSeries.html">Harmonic
  191.      * Series</a>.
  192.      *
  193.      * @param n Term in the series to calculate (must be larger than 1)
  194.      * @param m Exponent (special case {@code m = 1} is the harmonic series).
  195.      * @return the n<sup>th</sup> generalized harmonic number.
  196.      */
  197.     private double generalizedHarmonic(final int n, final double m) {
  198.         double value = 0;
  199.         for (int k = n; k > 0; --k) {
  200.             value += 1.0 / FastMath.pow(k, m);
  201.         }
  202.         return value;
  203.     }

  204.     /**
  205.      * {@inheritDoc}
  206.      *
  207.      * The lower bound of the support is always 1 no matter the parameters.
  208.      *
  209.      * @return lower bound of the support (always 1)
  210.      */
  211.     public int getSupportLowerBound() {
  212.         return 1;
  213.     }

  214.     /**
  215.      * {@inheritDoc}
  216.      *
  217.      * The upper bound of the support is the number of elements.
  218.      *
  219.      * @return upper bound of the support
  220.      */
  221.     public int getSupportUpperBound() {
  222.         return getNumberOfElements();
  223.     }

  224.     /**
  225.      * {@inheritDoc}
  226.      *
  227.      * The support of this distribution is connected.
  228.      *
  229.      * @return {@code true}
  230.      */
  231.     public boolean isSupportConnected() {
  232.         return true;
  233.     }
  234. }