[LON-CAPA-cvs] cvs: modules /gerd/harvesting guess_subject.pl
www
www at source.lon-capa.org
Wed Sep 7 11:01:12 EDT 2011
www Wed Sep 7 15:01:12 2011 EDT
Added files:
/modules/gerd/harvesting guess_subject.pl
Log:
Script to guess course topics
Index: modules/gerd/harvesting/guess_subject.pl
+++ modules/gerd/harvesting/guess_subject.pl
use strict;
# Keyword table for guessing a course's subject from its title.
# Each key is a literal substring that the main loop looks for
# (case-insensitively) in the course description; the value is the subject
# label recorded when it is found.  Keys are deliberately short stems so that
# one entry covers several word forms (e.g. 'medic' hits "medical" and
# "medicine").  Entries are grouped by subject; order is irrelevant because
# the output is sorted.
my %words=(
    # physics
    'phy'            => 'physics',
    'light'          => 'physics',
    'optics'         => 'physics',
    'magneti'        => 'physics',
    'fisic'          => 'physics',
    'mechani'        => 'physics',
    # chemistry and neighbours
    'cem'            => 'chemistry',
    'chem'           => 'chemistry',
    'chm'            => 'chemistry',
    'biochem'        => 'biochemistry',
    'biophy'         => 'biophysics',
    # life sciences
    'bio'            => 'biology',
    'genet'          => 'biology',
    'cell'           => 'biology',
    'mcb'            => 'biology',
    'organism'       => 'biology',
    'anatomy'        => 'biology',
    'hhmi'           => 'biology',
    'zoo'            => 'zoology',
    'ecolo'          => 'ecology',
    # earth and space
    'geol'           => 'geology',
    'earth'          => 'geology',
    'ast'            => 'astronomy',
    'universe'       => 'astronomy',
    # mathematics, geometry, statistics
    'math'           => 'mathematics',
    'mth'            => 'mathematics',
    'calculus'       => 'mathematics',
    'algebra'        => 'mathematics',
    'geom'           => 'geometry',
    'trigono'        => 'geometry',
    'stat'           => 'statistics',
    'stt'            => 'statistics',
    'probab'         => 'statistics',
    'wahrsch'        => 'statistics',
    'stoch'          => 'statistics',
    # computer science
    'cse'            => 'computerscience',
    'compu'          => 'computerscience',
    'informat'       => 'computerscience',
    'algorithm'      => 'computerscience',
    'digital'        => 'computerscience',
    'medien'         => 'computerscience',
    'media'          => 'computerscience',
    'programmier'    => 'computerscience',
    # 'programing' (single m) is kept for titles spelled that way; it is not
    # a substring of "programming", so the correct spelling needs its own key.
    'programing'     => 'computerscience',
    'programming'    => 'computerscience',
    'internet'       => 'computerscience',
    'diskrete'       => 'computerscience',
    # NOTE(review): 'metabo' mapping to computerscience looks like a
    # copy-paste slip (biology/biochemistry seems more likely) -- confirm
    # the intended subject before changing it.
    'metabo'         => 'computerscience',
    'betriebssys'    => 'computerscience',
    # engineering
    'elektrotechnik' => 'engineering',
    'engi'           => 'engineering',
    # medicine and health
    'physio'         => 'medicine',
    'medic'          => 'medicine',
    'mediz'          => 'medicine',
    'infect'         => 'medicine',
    'nerv'           => 'medicine',
    'narco'          => 'medicine',
    'hemato'         => 'medicine',
    'pulmo'          => 'medicine',
    'urinar'         => 'medicine',
    'neurolo'        => 'medicine',
    'skelet'         => 'medicine',
    'nur'            => 'nursing',
    # humanities and social sciences
    'psy'            => 'psychology',
    'philos'         => 'philosophy',
    'history'        => 'history',
    # business
    'econo'          => 'economy',
    'money'          => 'finance',
    'finan'          => 'finance',
    'accoun'         => 'accounting',
    'advert'         => 'advertising',
);
# Per-keyword veto patterns.  When a keyword from %words has an entry here
# and the entry's pattern (a regular expression, matched case-insensitively)
# is found in the course description, that keyword is skipped for the course.
my %excl=(
    # short stems that also occur inside unrelated words
    'phy'      => 'sophy|physio',
    'ast'      => 'stochast',
    'stat'     => 'state',
    'cem'      => 'placem',
    'media'    => 'intermedia',
    # keywords suppressed whenever the description also contains "phys"
    'calculus' => 'phys',
    'algebra'  => 'phys',
    'trigono'  => 'phys',
    'engi'     => 'phys',
);
# Read "id<TAB>description" records from course_titles.dat and print, for
# each record, "id<TAB>subjects" where subjects is the sorted, comma-separated
# list of subject labels whose keywords occur in the description.  A record
# with no matching keyword is still printed, with an empty subject list.
my $titles_file="course_titles.dat";
# Three-argument open on a lexical handle, and fail loudly: previously a
# missing or unreadable input file silently produced no output at all.
open(my $in,'<',$titles_file)
    or die "Cannot open $titles_file: $!\n";
while (my $line=<$in>) {
    chomp($line);
    my ($id,$desc)=split(/\t/,$line);
    # A line without a tab (or an empty line) leaves these undefined;
    # treat them as empty so such records still yield an output line.
    $id='' unless defined $id;
    $desc='' unless defined $desc;
    my %subj=();
    foreach my $word (keys %words) {
        # The exclusion entry is a regex on purpose (it may contain
        # alternation), so it is interpolated unquoted; a hit vetoes
        # this keyword for the current record.
        if ($excl{$word} && $desc=~/$excl{$word}/i) { next; }
        # The keyword itself is matched as a literal substring.
        if ($desc=~/\Q$word\E/i) {
            $subj{$words{$word}}=1;
        }
    }
    # Disabled filter, kept from the original: when enabled it prints only
    # records with no guessed subject or with more than one.
    # my @subjects=sort(keys(%subj));
    # if (($#subjects==-1) || ($#subjects>0)) {
    print $id."\t".join(',',sort(keys(%subj)))."\n";
    # }
}
close($in);
More information about the LON-CAPA-cvs
mailing list