// (C) Copyright 2008-2009 SDML (www.sdml.info) // // Use, modification and distribution is subject to the Boost Software // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) #ifndef SUFFIX_ARRAY_HPP #define SUFFIX_ARRAY_HPP #include #include #include #include #include #include #include #include "suffix_compare.hpp" //#include "suffix_array_iterator.hpp" /** * @todo Templatize. * @todo Implement find, count, etc. * @todo Use a policy class to implement suffix arrays both with and without * a 'height array,' which speeds up several operations at the expense * of requiring additional storage space (sizeof(size_type) * n). * This implementation will currently work without it, this will be * separated into the non-height-array using version later. */ namespace gpld { /** * Implements a suffix array class for an arbitrary type which currently isn't * arbitrary in the slightest. * */ class suffix_array { public: typedef std::string value_type; typedef value_type::const_iterator value_const_iterator; typedef value_type::iterator value_iterator; typedef std::allocator allocator_type; typedef std::vector container_type; typedef container_type::const_iterator container_const_iterator; typedef container_type::iterator container_iterator; typedef std::size_t size_type; typedef std::pair range_pair; typedef container_const_iterator const_iterator; typedef const_iterator iterator; suffix_array(); explicit suffix_array(value_type const& string); suffix_array(suffix_array const& other); suffix_array(suffix_array && other); ~suffix_array(); suffix_array& swap(suffix_array && other); suffix_array& operator=(suffix_array const& other); suffix_array& operator=(suffix_array && other); inline value_type const& str() const; inline value_const_iterator at (size_type idx) const; inline value_const_iterator operator[](size_type idx) const; size_type lcp(size_type a, size_type b) const; std::pair longest_common_substring() const; range_pair find(value_type const& needle) const; size_type count(value_type const& needle) const; inline void clear(); inline bool empty() const; inline size_type size() const; void write() const; void read(); private: value_type _string; container_type _indices; }; /** The default constructor. */ suffix_array::suffix_array() : _string(), _indices() {} /** * The value constructor. * Parses the given string. * @todo: Optimize, separate from constructor. */ suffix_array::suffix_array(value_type const& string) : _string(string), _indices() { // First, insert all of the relevant indices. for (value_const_iterator i = _string.begin(); i != _string.end(); ++i) { _indices.push_back(i); } // Use our suffix comparison functor suffix_compare comp(_string.end()); // Sort suffices lexicographically. std::sort(_indices.begin(), _indices.end(), comp); } /** The copy constructor. */ suffix_array::suffix_array(suffix_array const& other) : _string(other._string), _indices(other._indices) { } /** The move constructor. */ suffix_array::suffix_array(suffix_array && other) : _string(std::move(other._string)), _indices(std::move(other._indices)) { } /** The destructor. */ suffix_array::~suffix_array() { } /** Swaps a suffix array with another. */ suffix_array& suffix_array::swap(suffix_array && other) { using std::swap; std::swap(_string, other._string); std::swap(_indices, other._indices); return *this; } /** Copy assignment. */ suffix_array& suffix_array::operator=(suffix_array const& other) { return swap(suffix_array(other)); } /** Move assignment. */ suffix_array& suffix_array::operator=(suffix_array && other) { clear(); return swap(other); } /** Returns an iterator to the first suffix. */ /*suffix_array::iterator suffix_array::begin() const { return iterator(_indices.begin(), _string.begin()); }*/ /** Returns an iterator to one past the last suffix. */ /*suffix_array::iterator suffix_array::end() const { return iterator(_indices.end(), _string.begin()); }*/ /** Returns a reference to the underlying string object of the suffix. */ suffix_array::value_type const& suffix_array::str() const { return _string; } /** Returns the starting position of the nth suffix, with bounds checking. */ suffix_array::value_const_iterator suffix_array::at(size_type const idx) const { return _indices.at(idx); } /** Returns the starting position of the nth suffix, without bounds checking. */ suffix_array::value_const_iterator suffix_array::operator[](size_type const idx) const { return _indices[idx]; } /** Returns the length of the longest common prefix of the two suffices * ordered at a and b. * @remarks No bounds checking is performed on _indices. */ suffix_array::size_type suffix_array::lcp(size_type const a, size_type const b) const { return std::distance(_indices[a], std::mismatch(_indices[a], _string.end(), _indices[b]).first); } /** Finds the longest recurring substring in the supplied object. * @returns a pair in which the first element is the length, * and the second element is an index within the string starting * with the sequence. * * @todo height array version */ std::pair< suffix_array::size_type, suffix_array::value_const_iterator> suffix_array::longest_common_substring() const { // The previous record. size_type length = 0; value_const_iterator index; // Loop through all but the first suffix. for (size_type i = 1; i < size(); ++i) { // Find number of common characters with previous entry size_type s = lcp(i-1, i); // If greater than the previous record if (s > length) { length = s; index = _indices[i]; } } return std::make_pair(length, index); } /** * Finds the index of the first substring that begins with a particular * string. */ suffix_array::range_pair suffix_array::find(suffix_array::value_type const& needle) const { return std::make_pair(_indices.end(), _indices.end()); } /** * Clears the contents of the suffix array. * * Currently this is done by swapping a defaultly-constructed suffix array, * since this method does not depend on a clear() method being present * in the value type. */ void suffix_array::clear() { swap(suffix_array()); } /** Returns whether or not the suffix array is empty. */ bool suffix_array::empty() const { return _indices.empty(); } /** Returns the length of the underlying 'string.' */ suffix_array::size_type suffix_array::size() const { /// @todo Don't assume size() exists for contained object. return _string.size(); } } // end of GPLD namespace #endif