[LON-CAPA-cvs] cvs: modules /gerd/harvesting guess_subject.pl

Wed Sep 7 11:01:12 EDT 2011

www		Wed Sep  7 15:01:12 2011 EDT

  Added files:                 
    /modules/gerd/harvesting	guess_subject.pl 
  Log:
  Script to guess course topics
  
  

Index: modules/gerd/harvesting/guess_subject.pl
+++ modules/gerd/harvesting/guess_subject.pl
use strict;

# Lookup table: title fragment => subject it suggests.
# The classification loop tests every fragment as a case-insensitive
# substring of the course title, so short fragments such as 'phy' or 'ast'
# match inside longer words; %excl lists the vetoes for known false hits.
#
# The table is written grouped by subject (subject => list of fragments) and
# then inverted, which keeps related fragments together and makes it
# impossible to list the same fragment twice.
my %words;
{
    my %fragments_for = (
        accounting      => [qw(accoun)],
        advertising     => [qw(advert)],
        astronomy       => [qw(ast universe)],
        biochemistry    => [qw(biochem)],
        biology         => [qw(bio genet cell mcb organism anatomy hhmi)],
        biophysics      => [qw(biophy)],
        chemistry       => [qw(cem chem chm)],
        # 'programing' (single "m") is kept for titles spelled that way;
        # 'programming' covers the usual English spelling, which neither
        # 'programing' nor the German 'programmier' can match.
        # NOTE(review): 'metabo' (metabolism?) mapping to computerscience
        # looks like a copy-paste slip -- confirm the intended subject.
        computerscience => [qw(cse compu informat algorithm digital medien media
                               programmier programing programming internet
                               diskrete metabo betriebssys)],
        ecology         => [qw(ecolo)],
        economy         => [qw(econo)],
        engineering     => [qw(elektrotechnik engi)],
        finance         => [qw(money finan)],
        geology         => [qw(geol earth)],
        geometry        => [qw(geom trigono)],
        history         => [qw(history)],
        mathematics     => [qw(math mth calculus algebra)],
        medicine        => [qw(physio medic mediz infect nerv narco hemato pulmo
                               urinar neurolo skelet)],
        nursing         => [qw(nur)],
        philosophy      => [qw(philos)],
        physics         => [qw(phy light optics magneti fisic mechani)],
        psychology      => [qw(psy)],
        statistics      => [qw(stat stt probab wahrsch stoch)],
        zoology         => [qw(zoo)],
    );

    # Invert into the flat fragment => subject form the rest of the script uses.
    foreach my $subject (keys %fragments_for) {
        foreach my $fragment (@{ $fragments_for{$subject} }) {
            $words{$fragment} = $subject;
        }
    }
}

# Veto table: %words fragment => regex source.
# When the title matches the regex (case-insensitively), the classification
# loop skips that fragment for the whole title.
my %excl;

# Titles that mention "phys" are not tagged through these fragments.
$excl{$_} = 'phys' for qw(calculus algebra engi trigono);

# The remaining vetoes name longer strings that contain the short fragment.
$excl{media} = 'intermedia';
$excl{cem}   = 'placem';
$excl{stat}  = 'state';
$excl{ast}   = 'stochast';
$excl{phy}   = 'sophy|physio';
          
# Read "id<TAB>title" records from course_titles.dat and print, for each
# record, "id<TAB>subject1,subject2,..." with the guessed subjects sorted
# alphabetically (the list is empty when nothing matched).
my $titles_file = 'course_titles.dat';

# Three-argument open on a lexical handle; fail loudly instead of silently
# producing no output when the input file is missing or unreadable.
open(my $in, '<', $titles_file)
    or die "Cannot open $titles_file: $!";

while (my $line = <$in>) {
    chomp($line);
    my ($id, $desc) = split(/\t/, $line);

    # Blank lines and lines without a tab leave these undefined; treat them
    # as empty strings so such a line still yields an (empty) output record.
    $id   = '' unless defined $id;
    $desc = '' unless defined $desc;

    my %subj = ();
    foreach my $word (keys %words) {
        # A veto pattern that matches anywhere in the title disables this
        # fragment for the whole title.
        next if $excl{$word} && $desc =~ /$excl{$word}/i;

        # Fragments are literal text, hence \Q...\E.
        if ($desc =~ /\Q$word\E/i) {
            $subj{ $words{$word} } = 1;
        }
    }

    # Review aid kept from the original: wrap the print in the test below to
    # list only titles with no guess or with more than one guess.
    #   my @subjects=sort(keys(%subj));
    #   if (($#subjects==-1) || ($#subjects>0)) { ... }
    print $id . "\t" . join(',', sort(keys(%subj))) . "\n";
}

close($in);