Changeset 45253

Show
Ignore:
Timestamp:
05/11/08 16:55:03 (2 months ago)
Author:
RobMarsh
Message:

improvements to Similar Posts matching
experiment with Chinese/Korean/Japanese matching

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • similar-posts/trunk/readme.txt

    r45217 r45253  
    55Requires at least: 1.5 
    66Tested up to: 2.5.1 
    7 Stable tag: 2.5b27 
     7Stable tag: 2.5b28 
    88Displays a list of posts similar to the current one based on content, title and/or tags. 
    99 
     
    3232== Version History == 
    3333 
     34* 2.5b28 
     35      * improvements to Similar Posts matching 
     36      * experiment with Chinese/Korean/Japanese matching 
    3437* 2.5b27 
    3538      * fixed bug with bulk indexing of tags 
  • similar-posts/trunk/similar-posts-admin.php

    r45217 r45253  
    11<?php 
    22 
    3 // Admin stuff for Similar Posts Plugin, Version 2.5b27 
     3// Admin stuff for Similar Posts Plugin, Version 2.5b28 
    44 
    55function similar_posts_option_menu() { 
     
    216216                  $options['utf8'] = 'false'; 
    217217            } 
     218            $options['cjk'] = $_POST['cjk']; 
     219            if (!function_exists('mb_internal_encoding')) { 
     220                  $options['cjk'] = 'false'; 
     221            } 
    218222            $options['use_stemmer'] = $_POST['use_stemmer']; 
    219223            $options['batch'] = ppl_check_cardinal($_POST['batch']); 
    220224            if ($options['batch'] === 0) $options['batch'] = 100; 
    221225            flush(); 
    222             $termcount = save_index_entries (($options['utf8']==='true'), ($options['use_stemmer']==='true'), $options['batch']); 
     226            $termcount = save_index_entries (($options['utf8']==='true'), ($options['use_stemmer']==='true'), $options['batch'], ($options['cjk']==='true')); 
    223227            update_option('similar-posts', $options); 
    224228            //show a message 
     
    234238                  The index is created when the plugin is activated and then kept up-to-date  
    235239                  automatically when posts are added, edited, or deleted.</p> 
    236                   <p>The two options that affect the index can be set below.</p>', 'post_plugins'); 
     240                  <p>The options that affect the index can be set below.</p>', 'post_plugins'); 
    237241            _e('<p>If you are using a language other than english you may find that the plugin  
    238242                  mangles some characters since PHP is normally blind to multibyte characters. You  
     
    240244                  of a little speed. <em>This facility is only available if your  
    241245                  installation of PHP supports the mbstring functions.</em></p>', 'post_plugins'); 
     246            _e('<p>Languages like Chinese, Korean and Japanese pose a special difficulty for 
     247                  the full-text search algorithm. As an <em>experiment</em> I have introduced an option below  
     248                  to work around some of these issues. The text must be encoded as UTF-8. I would be very grateful for feedback from any 
     249                  users knowledgeable in these languages.</p>', 'post_plugins'); 
    242250            _e('<p>Some related word forms should really be counted together, e.g., "follow",  
    243251                  "follows", and "following". The plugin can use a <em>stemming</em> algorithm to 
     
    261269                              <option <?php if($options['utf8'] == 'false') { echo 'selected="selected"'; } ?> value="false">No</option> 
    262270                              <option <?php if($options['utf8'] == 'true') { echo 'selected="selected"'; } ?> value="true">Yes</option> 
     271                              </select> 
     272                        </td>  
     273                  </tr> 
     274                  <tr valign="top"> 
     275                        <th scope="row"><?php _e('Treat as Chinese, Korean, or Japanese?', 'post_plugins') ?></th> 
     276                        <td> 
     277                              <select name="cjk" id="cjk" <?php if (!function_exists('mb_split')) echo 'disabled="true"'; ?> > 
     278                              <option <?php if($options['cjk'] == 'false') { echo 'selected="selected"'; } ?> value="false">No</option> 
     279                              <option <?php if($options['cjk'] == 'true') { echo 'selected="selected"'; } ?> value="true">Yes</option> 
    263280                              </select> 
    264281                        </td>  
     
    493510 
    494511// sets up the index for the blog 
    495 function save_index_entries ($utf8=false, $use_stemmer=false, $batch=100) { 
     512function save_index_entries ($utf8=false, $use_stemmer=false, $batch=100, $cjk=false) { 
    496513      global $wpdb, $table_prefix; 
    497514      $table_name = $table_prefix.'similar_posts'; 
     
    503520            reset($posts); 
    504521            while (list($dummy, $post) = each($posts)) { 
    505                   $content = sp_get_post_terms($post['post_content'], $utf, $use_stemmer); 
    506                   $title = sp_get_title_terms($post['post_title'], $utf, $use_stemmer); 
     522                  $content = sp_get_post_terms($post['post_content'], $utf8, $use_stemmer, $cjk); 
     523                  $title = sp_get_title_terms($post['post_title'], $utf8, $use_stemmer, $cjk); 
    507524                  $postID = $post['ID']; 
    508                   $tags = sp_get_tag_terms($postID, $utf); 
     525                  $tags = sp_get_tag_terms($postID, $utf8); 
    509526                  $wpdb->query("INSERT INTO `$table_name` (pID, content, title, tags) VALUES ($postID, \"$content\", \"$title\", \"$tags\")"); 
    510527                  $termcount = $termcount + 1; 
     
    631648      if (!isset($options['utf8'])) $options['utf8'] = 'false'; 
    632649      if (!function_exists('mb_internal_encoding')) $options['utf8'] = 'false'; 
     650      if (!isset($options['cjk'])) $options['cjk'] = 'false'; 
     651      if (!function_exists('mb_internal_encoding')) $options['cjk'] = 'false'; 
    633652      if (!isset($options['use_stemmer'])) $options['use_stemmer'] = 'false'; 
    634653      if (!isset($options['batch'])) $options['batch'] = '100'; 
     
    638657      // initial creation of the index, if the table is empty 
    639658      $num_index_posts = $wpdb->get_var("SELECT COUNT(*) FROM `$table_name`"); 
    640       if ($num_index_posts == 0) save_index_entries (($options['utf8'] === 'true'), false);    
     659      if ($num_index_posts == 0) save_index_entries (($options['utf8'] === 'true'), false, $options['batch'], ($options['cjk'] === 'true'));   
    641660 
    642661      // deactivate legacy Similar Posts Feed if present 
  • similar-posts/trunk/similar-posts.php

    r45217 r45253  
    44Plugin URI: http://rmarsh.com/plugins/similar-posts/ 
    55Description: Displays a <a href="options-general.php?page=similar-posts.php">highly configurable</a> list of related posts. Similarity can be based on any combination of word usage in the content, title, or tags. Don't be disturbed if it takes a few moments to complete the installation -- the plugin is indexing your posts. <a href="http://rmarsh.com/plugins/post-options/">Instructions and help online</a>. Requires the latest version of the <a href="http://wordpress.org/extend/plugins/post-plugin-library/">Post-Plugin Library</a> to be installed. 
    6 Version: 2.5b27 
     6Version: 2.5b28 
    77Author: Rob Marsh, SJ 
    88Author URI: http://rmarsh.com/ 
     
    2323*/ 
    2424 
    25 $similar_posts_version = $similar_posts_feed_version= '2.5b27'; 
     25$similar_posts_version = $similar_posts_feed_version= '2.5b28'; 
    2626 
    2727/* 
     
    291291      $options = get_option('similar-posts'); 
    292292      $utf8 = ($options['utf8'] === 'true'); 
     293      $cjk = ($options['cjk'] === 'true'); 
    293294      $use_stemmer = ($options['use_stemmer'] === 'true'); 
    294       $content = sp_get_post_terms($post['post_content'], $utf8, $use_stemmer); 
    295       $title = sp_get_title_terms($post['post_title'], $utf8, $use_stemmer); 
     295      $content = sp_get_post_terms($post['post_content'], $utf8, $use_stemmer, $cjk); 
     296      $title = sp_get_title_terms($post['post_title'], $utf8, $use_stemmer, $cjk); 
    296297      $tags = sp_get_tag_terms($postID, $utf8); 
    297298      //check to see if the field is set 
     
    340341} 
    341342 
    342 function sp_get_post_terms($text, $utf8, $use_stemmer) { 
     343// takes a CJK string and inserts spaces between its multibyte characters -- runs of ASCII text are left unspaced 
     344function sp_cjk_spacer($string) { 
     345      mb_internal_encoding("UTF-8"); 
     346    $strlen = mb_strlen($string); 
     347      $ascii = ''; 
     348      $result = array(); 
     349      for ($i = 0; $i < $strlen; $i++) { 
     350            $c = mb_substr($string, $i, 1); 
     351            if (strlen($c) > 1) { 
     352                  if ($ascii) { 
     353                        $result[] = $ascii; 
     354                        $ascii = ''; 
     355                  }      
     356                  $result[] = sp_mb_str_pad($c, 4, '_'); 
     357            } else { 
     358                  $ascii .= $c; 
     359            } 
     360    } 
     361      if ($ascii) $result[] = $ascii; 
     362    return implode(' ', $result); 
     363} 
     364 
     365function sp_get_post_terms($text, $utf8, $use_stemmer, $cjk) { 
     366      if ($cjk) return sp_cjk_spacer(sp_mb_clean_words($text));    
    343367      global $overusedwords; 
    344       if ($utf) { 
     368      if ($utf8) { 
    345369            if ($use_stemmer) { 
    346370                  mb_regex_encoding('UTF-8'); 
     
    401425$tinywords = array('the' => 1, 'and' => 1, 'of' => 1, 'a' => 1, 'for' => 1, 'on' => 1); 
    402426 
    403 function sp_get_title_terms($text, $utf8, $use_stemmer) { 
     427function sp_get_title_terms($text, $utf8, $use_stemmer, $cjk) { 
    404428      global $tinywords; 
    405       if ($utf) { 
     429      if ($cjk) return sp_cjk_spacer(sp_mb_clean_words($text));    
     430      if ($utf8) { 
    406431            if ($use_stemmer) { 
    407432                  mb_regex_encoding('UTF-8');