ROOT  6.06/09
Reference Guide
TCondor.cxx
Go to the documentation of this file.
1 // @(#)root/proof:$Id$
2 // Author: Maarten Ballintijn 06/12/03
3 
4 /*************************************************************************
5  * Copyright (C) 1995-2001, Rene Brun and Fons Rademakers. *
6  * All rights reserved. *
7  * *
8  * For the licensing terms see $ROOTSYS/LICENSE. *
9  * For the list of contributors see $ROOTSYS/README/CREDITS. *
10  *************************************************************************/
11 
12 //////////////////////////////////////////////////////////////////////////
13 // //
14 // TCondor //
15 // //
16 // Interface to the Condor system. TCondor provides a (partial) API for //
17 // querying and controlling the Condor system, including experimental //
18 // extensions like COD (computing on demand) //
19 // //
20 //////////////////////////////////////////////////////////////////////////
21 
22 #include <stdlib.h>
23 
24 #include "TCondor.h"
25 #include "TList.h"
26 #include "TSystem.h"
27 #include "TObjString.h"
28 #include "TRegexp.h"
29 #include "TProofDebug.h"
30 #include "Riostream.h"
31 #include "TEnv.h"
32 #include "TClass.h"
33 
36 
37 
38 ////////////////////////////////////////////////////////////////////////////////
39 /// Create Condor interface object. Uses Condor apps since there is no
40 /// API yet.
41 
42 TCondor::TCondor(const char *pool) : fPool(pool), fState(kFree)
43 {
44  fClaims = new TList;
45 
46  // Setup Condor
47 
48  TString condorHome = gEnv->GetValue("Proof.CondorHome", (char*)0);
49  if (condorHome != "") {
50  TString path = gSystem->Getenv("PATH");
51  path = condorHome + "/bin:" + path;
52  gSystem->Setenv("PATH",path);
53  }
54 
55  TString condorConf = gEnv->GetValue("Proof.CondorConfig", (char*)0);
56  if (condorConf != "") {
57  gSystem->Setenv("CONDOR_CONFIG",condorConf);
58  }
59 
60  char *loc = gSystem->Which(gSystem->Getenv("PATH"), "condor_cod",
62 
63  if (loc) {
64  fValid = kTRUE;
65  delete [] loc;
66  } else {
67  fValid = kFALSE;
68  }
69 }
70 
71 
72 ////////////////////////////////////////////////////////////////////////////////
73 /// Cleanup Condor interface.
74 
76 {
77  PDB(kCondor,1) Info("~TCondor","fState %d", fState );
78 
79  if (fState != kFree) {
80  Release();
81  }
82  delete fClaims;
83 }
84 
85 
86 ////////////////////////////////////////////////////////////////////////////////
87 /// Print master status
88 
89 void TCondor::Print(Option_t * opt) const
90 {
91  std::cout << "OBJ: " << IsA()->GetName()
92  << "\tPool: \"" << fPool << "\""
93  << "\tState: " << fState << std::endl;
94  fClaims->Print(opt);
95 }
96 
97 
98 ////////////////////////////////////////////////////////////////////////////////
99 /// Claim a VirtualMachine for PROOF usage.
100 
101 TCondorSlave *TCondor::ClaimVM(const char *vm, const char *cmd)
102 {
103 // TString reinitCmd = "KRB5CCNAME=FILE:/tmp/condor.$$ && /usr/krb5/bin/kinit -F -k -t /etc/cdfcaf.keytab cafuser/cdf/h2caf@FNAL.GOV";
104 // gSystem->Exec(reinitCmd.Data());
105  Int_t port = 0;
106 
107  TString claimCmd = Form("condor_cod request -name %s -timeout 10 2>>%s/condor.proof.%d",
108  vm, gSystem->TempDirectory(), gSystem->GetUid() );
109 
110  PDB(kCondor,2) Info("ClaimVM","command: %s", claimCmd.Data());
111  FILE *pipe = gSystem->OpenPipe(claimCmd, "r");
112 
113  if (!pipe) {
114  SysError("ClaimVM","cannot run command: %s", claimCmd.Data());
115  return 0;
116  }
117 
118  TString claimId;
119  TString line;
120  while (line.Gets(pipe)) {
121  PDB(kCondor,3) Info("ClaimVM","line = %s", line.Data());
122 
123  if (line.BeginsWith("ClaimId = \"")) {
124  line.Remove(0, line.Index("\"")+1);
125  line.Chop(); // remove trailing "
126  claimId = line;
127  PDB(kCondor,1) Info("ClaimVM","claim = '%s'", claimId.Data());
128  TRegexp r("[0-9]*$");
129  TString num = line(r);
130  port = 37000 + atoi(num.Data());
131  PDB(kCondor,1) Info("ClaimVM","port = %d", port);
132  }
133  }
134 
135  Int_t r = gSystem->ClosePipe(pipe);
136  if (r) {
137  Error("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
138  return 0;
139  } else {
140  PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
141  }
142 
143  TString jobad("jobad");
144  FILE *jf = gSystem->TempFileName(jobad);
145 
146  if (jf == 0) return 0;
147 
148  TString str(cmd);
149  str.ReplaceAll("$(Port)", Form("%d", port));
150  fputs(str, jf);
151 
152  fclose(jf);
153 
154  TString activateCmd = Form("condor_cod activate -id '%s' -jobad %s",
155  claimId.Data(), jobad.Data() );
156 
157  PDB(kCondor,2) Info("ClaimVM","command: %s", activateCmd.Data());
158  pipe = gSystem->OpenPipe(activateCmd, "r");
159 
160  if (!pipe) {
161  SysError("ClaimVM","cannot run command: %s", activateCmd.Data());
162  return 0;
163  }
164 
165  while (line.Gets(pipe)) {
166  PDB(kCondor,3) Info("ClaimVM","Activate: line = %s", line.Data());
167  }
168 
169  r = gSystem->ClosePipe(pipe);
170  if (r) {
171  Error("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
172  } else {
173  PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
174  }
175 
176  gSystem->Unlink(jobad);
177 
178  // TODO: get info at the start for all nodes ...
179  TCondorSlave *claim = new TCondorSlave;
180  claim->fClaimID = claimId;
181  TString node(vm);
182  node = node.Remove(0, node.Index("@")+1);
183  claim->fHostname = node;
184  claim->fPort = port;
185  claim->fPerfIdx = 100; //set performance index to 100 by default
186  claim->fImage = node; //set image to hostname by default
187 
188  return claim;
189 }
190 
191 
192 ////////////////////////////////////////////////////////////////////////////////
193 /// Get the names of the virtual machines in the pool.
194 /// Return a TList of TObjString or 0 in case of failure
195 
197 {
198  TString poolopt = fPool ? "" : Form("-pool %s", fPool.Data());
199  TString cmd = Form("condor_status %s -format \"%%s\\n\" Name", poolopt.Data());
200 
201  PDB(kCondor,2) Info("GetVirtualMachines","command: %s", cmd.Data());
202 
203  FILE *pipe = gSystem->OpenPipe(cmd, "r");
204 
205  if (!pipe) {
206  SysError("GetVirtualMachines","cannot run command: %s", cmd.Data());
207  return 0;
208  }
209 
210  TString line;
211  TList *l = new TList;
212  while (line.Gets(pipe)) {
213  PDB(kCondor,3) Info("GetVirtualMachines","line = %s", line.Data());
214  if (line != "") l->Add(new TObjString(line));
215  }
216 
217  Int_t r = gSystem->ClosePipe(pipe);
218  if (r) {
219  delete l;
220  Error("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
221  return 0;
222  } else {
223  PDB(kCondor,1) Info("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
224  }
225 
226  return l;
227 }
228 
229 
230 ////////////////////////////////////////////////////////////////////////////////
231 /// Claim n virtual machines
232 /// This function figures out the image and performance index before returning
233 /// the list of condor slaves
234 
235 TList *TCondor::Claim(Int_t n, const char *cmd)
236 {
237  if (fState != kFree) {
238  Error("Claim","not in state Free");
239  return 0;
240  }
241 
242  TList *vms = GetVirtualMachines();
243  TIter next(vms);
244  TObjString *vm;
245  for(Int_t i=0; i < n && (vm = (TObjString*) next()) != 0; i++ ) {
246  TCondorSlave *claim = ClaimVM(vm->GetName(), cmd);
247  if (claim != 0) {
248  if ( !GetVmInfo(vm->GetName(), claim->fImage, claim->fPerfIdx) ) {
249  // assume vm is gone
250  delete claim;
251  } else {
252  fClaims->Add(claim);
253  fState = kActive;
254  }
255  }
256  }
257 
258  return fClaims;
259 }
260 
261 
262 ////////////////////////////////////////////////////////////////////////////////
263 /// Claim virtual machine with name vmname
264 /// This function does not figure out the image and performance index before
265 /// returning the condor slave
266 
267 TCondorSlave *TCondor::Claim(const char *vmname, const char *cmd)
268 {
269  if (fState != kFree && fState != kActive) {
270  Error("Claim","not in state Free or Active");
271  return 0;
272  }
273 
274  TCondorSlave *claim = ClaimVM(vmname, cmd);
275  if (claim != 0) {
276  fClaims->Add(claim);
277  fState = kActive;
278  }
279 
280  return claim;
281 }
282 
283 
284 ////////////////////////////////////////////////////////////////////////////////
285 /// Set the state of workers
286 
288 {
289  PDB(kCondor,1) Info("SetState","state: %s (%lld)",
290  state == kSuspended ? "kSuspended" : "kActive", Long64_t(gSystem->Now()));
291  TIter next(fClaims);
292  TCondorSlave *claim;
293  while((claim = (TCondorSlave*) next()) != 0) {
294  TString cmd = Form("condor_cod %s -id '%s'",
295  state == kSuspended ? "suspend" : "resume",
296  claim->fClaimID.Data());
297 
298  PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
299  FILE *pipe = gSystem->OpenPipe(cmd, "r");
300 
301  if (!pipe) {
302  SysError("SetState","cannot run command: %s", cmd.Data());
303  return kFALSE;
304  }
305 
306  TString line;
307  while (line.Gets(pipe)) {
308  PDB(kCondor,3) Info("SetState","line = %s", line.Data());
309  }
310 
311  Int_t r = gSystem->ClosePipe(pipe);
312  if (r) {
313  Error("SetState","command: %s returned %d", cmd.Data(), r);
314  return kFALSE;
315  } else {
316  PDB(kCondor,1) Info("SetState","command: %s returned %d", cmd.Data(), r);
317  }
318  }
319 
320  fState = state;
321  return kTRUE;
322 }
323 
324 
325 ////////////////////////////////////////////////////////////////////////////////
326 /// Suspend worker
327 
329 {
330  if (fState != kActive) {
331  Error("Suspend","not in state Active");
332  return kFALSE;
333  }
334 
335  return SetState(kSuspended);
336 }
337 
338 
339 ////////////////////////////////////////////////////////////////////////////////
340 /// Resume worker
341 
343 {
344  if (fState != kSuspended) {
345  Error("Suspend","not in state Suspended");
346  return kFALSE;
347  }
348 
349  return SetState(kActive);
350 }
351 
352 
353 ////////////////////////////////////////////////////////////////////////////////
354 /// Release worker
355 
357 {
358  if (fState == kFree) {
359  Error("Suspend","not in state Active or Suspended");
360  return kFALSE;
361  }
362 
363  TCondorSlave *claim;
364  while((claim = (TCondorSlave*) fClaims->First()) != 0) {
365  TString cmd = Form("condor_cod release -id '%s'", claim->fClaimID.Data());
366 
367  PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
368  FILE *pipe = gSystem->OpenPipe(cmd, "r");
369 
370  if (!pipe) {
371  SysError("Release","cannot run command: %s", cmd.Data());
372  return kFALSE;
373  }
374 
375  TString line;
376  while (line.Gets(pipe)) {
377  PDB(kCondor,3) Info("Release","line = %s", line.Data());
378  }
379 
380  Int_t r = gSystem->ClosePipe(pipe);
381  if (r) {
382  Error("Release","command: %s returned %d", cmd.Data(), r);
383  return kFALSE;
384  } else {
385  PDB(kCondor,1) Info("Release","command: %s returned %d", cmd.Data(), r);
386  }
387 
388  fClaims->Remove(claim);
389  delete claim;
390  }
391 
392  fState = kFree;
393  return kTRUE;
394 }
395 
396 
397 ////////////////////////////////////////////////////////////////////////////////
398 /// Get info about worker status
399 
400 Bool_t TCondor::GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
401 {
402  TString cmd = Form("condor_status -format \"%%d:\" Mips -format \"%%s\\n\" FileSystemDomain "
403  "-const 'Name==\"%s\"'", vm);
404 
405  PDB(kCondor,2) Info("GetVmInfo","command: %s", cmd.Data());
406  FILE *pipe = gSystem->OpenPipe(cmd, "r");
407 
408  if (!pipe) {
409  SysError("GetVmInfo","cannot run command: %s", cmd.Data());
410  return kFALSE;
411  }
412 
413  TString line;
414  while (line.Gets(pipe)) {
415  PDB(kCondor,3) Info("GetVmInfo","line = %s", line.Data());
416  if (line != "") {
417  TString amips = line(TRegexp("^[0-9]*"));
418  perfidx = atoi(amips);
419  image = line(TRegexp("[^:]+$"));
420  break;
421  }
422  }
423 
424  Int_t r = gSystem->ClosePipe(pipe);
425  if (r) {
426  Error("GetVmInfo","command: %s returned %d", cmd.Data(), r);
427  return kFALSE;
428  } else {
429  PDB(kCondor,1) Info("GetVmInfo","command: %s returned %d", cmd.Data(), r);
430  }
431 
432  return kTRUE;
433 }
434 
435 
436 ////////////////////////////////////////////////////////////////////////////////
437 /// Get image of the worker
438 
439 TString TCondor::GetImage(const char *host) const
440 {
441  TString cmd = Form("condor_status -direct %s -format \"Image:%%s\\n\" "
442  "FileSystemDomain", host);
443 
444  PDB(kCondor,2) Info("GetImage","command: %s", cmd.Data());
445 
446  FILE *pipe = gSystem->OpenPipe(cmd, "r");
447 
448  if (!pipe) {
449  SysError("GetImage","cannot run command: %s", cmd.Data());
450  return "";
451  }
452 
453  TString image;
454  TString line;
455  while (line.Gets(pipe)) {
456  PDB(kCondor,3) Info("GetImage","line = %s", line.Data());
457  if (line != "") {
458  image = line(TRegexp("[^:]+$"));
459  break;
460  }
461  }
462 
463  Int_t r = gSystem->ClosePipe(pipe);
464  if (r) {
465  Error("GetImage","command: %s returned %d", cmd.Data(), r);
466  return "";
467  } else {
468  PDB(kCondor,1) Info("GetImage","command: %s returned %d", cmd.Data(), r);
469  }
470 
471  return image;
472 }
473 
474 
475 ////////////////////////////////////////////////////////////////////////////////
476 /// Print worker status
477 
478 void TCondorSlave::Print(Option_t * /*opt*/ ) const
479 {
480  std::cout << "OBJ: " << IsA()->GetName()
481  << " " << fHostname << ":" << fPort
482  << " Perf: " << fPerfIdx
483  << " Image: " << fImage << std::endl;
484 }
Bool_t Release()
Release worker.
Definition: TCondor.cxx:356
Bool_t Suspend()
Suspend worker.
Definition: TCondor.cxx:328
long long Long64_t
Definition: RtypesCore.h:69
ClassImp(TSeqCollection) Int_t TSeqCollection TIter next(this)
Return index of object in collection.
TLine * line
Collectable string class.
Definition: TObjString.h:32
const char Option_t
Definition: RtypesCore.h:62
Bool_t SetState(EState state)
Set the state of workers.
Definition: TCondor.cxx:287
TString & ReplaceAll(const TString &s1, const TString &s2)
Definition: TString.h:635
Bool_t GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
Get info about worker status.
Definition: TCondor.cxx:400
virtual void Info(const char *method, const char *msgfmt,...) const
Issue info message.
Definition: TObject.cxx:892
Regular expression class.
Definition: TRegexp.h:35
TList * GetVirtualMachines() const
Get the names of the virtual machines in the pool.
Definition: TCondor.cxx:196
virtual const char * TempDirectory() const
Return a user configured or systemwide directory to create temporary files in.
Definition: TSystem.cxx:1447
Basic string class.
Definition: TString.h:137
int Int_t
Definition: RtypesCore.h:41
bool Bool_t
Definition: RtypesCore.h:59
const Bool_t kFALSE
Definition: Rtypes.h:92
virtual char * Which(const char *search, const char *file, EAccessMode mode=kFileExists)
Find location of file in a search path.
Definition: TSystem.cxx:1511
virtual FILE * OpenPipe(const char *command, const char *mode)
Open a pipe.
Definition: TSystem.cxx:666
Bool_t BeginsWith(const char *s, ECaseCompare cmp=kExact) const
Definition: TString.h:558
const char * Data() const
Definition: TString.h:349
virtual int Unlink(const char *name)
Unlink, i.e. remove, a file.
Definition: TSystem.cxx:1346
void Print(Option_t *option="") const
Print master status.
Definition: TCondor.cxx:89
virtual FILE * TempFileName(TString &base, const char *dir=0)
Create a secure temporary file by appending a unique 6 letter string to base.
Definition: TSystem.cxx:1462
#define PDB(mask, level)
Definition: TProofDebug.h:58
TString fClaimID
Definition: TCondor.h:44
void Print(Option_t *option="") const
Print worker status.
Definition: TCondor.cxx:478
virtual const char * Getenv(const char *env)
Get environment variable.
Definition: TSystem.cxx:1627
TList * Claim(Int_t n, const char *cmd)
Claim n virtual machines This function figures out the image and performance index before returning t...
Definition: TCondor.cxx:235
EState
Definition: TCondor.h:58
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition: TObject.cxx:918
EState fState
Definition: TCondor.h:64
A doubly linked list.
Definition: TList.h:47
TString fHostname
Definition: TCondor.h:40
virtual TTime Now()
Get current time in milliseconds since 0:00 Jan 1 1995.
Definition: TSystem.cxx:467
virtual void Setenv(const char *name, const char *value)
Set environment variable.
Definition: TSystem.cxx:1611
ROOT::R::TRInterface & r
Definition: Object.C:4
R__EXTERN TSystem * gSystem
Definition: TSystem.h:549
virtual Int_t GetValue(const char *name, Int_t dflt)
Returns the integer value for a resource.
Definition: TEnv.cxx:494
virtual TObject * Remove(TObject *obj)
Remove object from the list.
Definition: TList.cxx:674
Bool_t Gets(FILE *fp, Bool_t chop=kTRUE)
Read one line from the stream, including the , or until EOF.
Definition: Stringio.cxx:198
TClass * IsA() const
char * Form(const char *fmt,...)
TLine * l
Definition: textangle.C:4
virtual const char * GetName() const
Returns name of object.
Definition: TNamed.h:51
virtual int ClosePipe(FILE *pipe)
Close the pipe.
Definition: TSystem.cxx:675
virtual ~TCondor()
Cleanup Condor interface.
Definition: TCondor.cxx:75
virtual void Print(Option_t *option="") const
Default print for collections, calls Print(option, 1).
TString & Remove(Ssiz_t pos)
Definition: TString.h:616
virtual void SysError(const char *method, const char *msgfmt,...) const
Issue system error message.
Definition: TObject.cxx:932
R__EXTERN TEnv * gEnv
Definition: TEnv.h:174
TList * fClaims
Definition: TCondor.h:65
TString GetImage(const char *host) const
Get image of the worker.
Definition: TCondor.cxx:439
virtual Int_t GetUid(const char *user=0)
Returns the user's id. If user = 0, returns current user's id.
Definition: TSystem.cxx:1524
virtual TObject * First() const
Return the first object in the list. Returns 0 when list is empty.
Definition: TList.cxx:556
TString fPool
Definition: TCondor.h:63
virtual void Add(TObject *obj)
Definition: TList.h:81
Int_t fPort
Definition: TCondor.h:41
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
Definition: TString.h:582
ClassImp(TCondorSlave) ClassImp(TCondor) TCondor
Create Condor interface object.
Definition: TCondor.cxx:34
const Bool_t kTRUE
Definition: Rtypes.h:91
TCondorSlave * ClaimVM(const char *vm, const char *cmd)
Claim a VirtualMachine for PROOF usage.
Definition: TCondor.cxx:101
Bool_t Resume()
Resume worker.
Definition: TCondor.cxx:342
const Int_t n
Definition: legend1.C:16
TString fImage
Definition: TCondor.h:43
TString & Chop()
Definition: TString.h:622
const char Int_t const char * image
Definition: TXSlave.cxx:46
Int_t fPerfIdx
Definition: TCondor.h:42