Logo ROOT   6.16/01
Reference Guide
TCondor.cxx
Go to the documentation of this file.
1// @(#)root/proof:$Id$
2// Author: Maarten Ballintijn 06/12/03
3
4/*************************************************************************
5 * Copyright (C) 1995-2001, Rene Brun and Fons Rademakers. *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12//////////////////////////////////////////////////////////////////////////
13// //
14// TCondor //
15// //
16// Interface to the Condor system. TCondor provides a (partial) API for //
17// querying and controlling the Condor system, including experimental //
18// extensions like COD (computing on demand) //
19// //
20//////////////////////////////////////////////////////////////////////////
21
22#include <stdlib.h>
23
24#include "TCondor.h"
25#include "TList.h"
26#include "TSystem.h"
27#include "TObjString.h"
28#include "TRegexp.h"
29#include "TProofDebug.h"
30#include "Riostream.h"
31#include "TEnv.h"
32#include "TClass.h"
33
36
37
38////////////////////////////////////////////////////////////////////////////////
39/// Create Condor interface object. Uses Condor apps since there is no
40/// API yet.
41
42TCondor::TCondor(const char *pool) : fPool(pool), fState(kFree)
43{
44 fClaims = new TList;
45
46 // Setup Condor
47
48 TString condorHome = gEnv->GetValue("Proof.CondorHome", (char*)0);
49 if (condorHome != "") {
50 TString path = gSystem->Getenv("PATH");
51 path = condorHome + "/bin:" + path;
52 gSystem->Setenv("PATH",path);
53 }
54
55 TString condorConf = gEnv->GetValue("Proof.CondorConfig", (char*)0);
56 if (condorConf != "") {
57 gSystem->Setenv("CONDOR_CONFIG",condorConf);
58 }
59
60 char *loc = gSystem->Which(gSystem->Getenv("PATH"), "condor_cod",
62
63 if (loc) {
64 fValid = kTRUE;
65 delete [] loc;
66 } else {
67 fValid = kFALSE;
68 }
69}
70
71
72////////////////////////////////////////////////////////////////////////////////
73/// Cleanup Condor interface.
74
76{
77 PDB(kCondor,1) Info("~TCondor","fState %d", fState );
78
79 if (fState != kFree) {
80 Release();
81 }
82 delete fClaims;
83}
84
85
86////////////////////////////////////////////////////////////////////////////////
87/// Print master status
88
89void TCondor::Print(Option_t * opt) const
90{
91 std::cout << "OBJ: " << IsA()->GetName()
92 << "\tPool: \"" << fPool << "\""
93 << "\tState: " << fState << std::endl;
94 fClaims->Print(opt);
95}
96
97
98////////////////////////////////////////////////////////////////////////////////
99/// Claim a VirtualMachine for PROOF usage.
100
101TCondorSlave *TCondor::ClaimVM(const char *vm, const char *cmd)
102{
103// TString reinitCmd = "KRB5CCNAME=FILE:/tmp/condor.$$ && /usr/krb5/bin/kinit -F -k -t /etc/cdfcaf.keytab cafuser/cdf/h2caf@FNAL.GOV";
104// gSystem->Exec(reinitCmd.Data());
105 Int_t port = 0;
106
107 TString claimCmd = Form("condor_cod request -name %s -timeout 10 2>>%s/condor.proof.%d",
108 vm, gSystem->TempDirectory(), gSystem->GetUid() );
109
110 PDB(kCondor,2) Info("ClaimVM","command: %s", claimCmd.Data());
111 FILE *pipe = gSystem->OpenPipe(claimCmd, "r");
112
113 if (!pipe) {
114 SysError("ClaimVM","cannot run command: %s", claimCmd.Data());
115 return 0;
116 }
117
118 TString claimId;
120 while (line.Gets(pipe)) {
121 PDB(kCondor,3) Info("ClaimVM","line = %s", line.Data());
122
123 if (line.BeginsWith("ClaimId = \"")) {
124 line.Remove(0, line.Index("\"")+1);
125 line.Chop(); // remove trailing "
126 claimId = line;
127 PDB(kCondor,1) Info("ClaimVM","claim = '%s'", claimId.Data());
128 TRegexp r("[0-9]*$");
129 TString num = line(r);
130 port = 37000 + atoi(num.Data());
131 PDB(kCondor,1) Info("ClaimVM","port = %d", port);
132 }
133 }
134
135 Int_t r = gSystem->ClosePipe(pipe);
136 if (r) {
137 Error("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
138 return 0;
139 } else {
140 PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
141 }
142
143 TString jobad("jobad");
144 FILE *jf = gSystem->TempFileName(jobad);
145
146 if (jf == 0) return 0;
147
148 TString str(cmd);
149 str.ReplaceAll("$(Port)", Form("%d", port));
150 fputs(str, jf);
151
152 fclose(jf);
153
154 TString activateCmd = Form("condor_cod activate -id '%s' -jobad %s",
155 claimId.Data(), jobad.Data() );
156
157 PDB(kCondor,2) Info("ClaimVM","command: %s", activateCmd.Data());
158 pipe = gSystem->OpenPipe(activateCmd, "r");
159
160 if (!pipe) {
161 SysError("ClaimVM","cannot run command: %s", activateCmd.Data());
162 return 0;
163 }
164
165 while (line.Gets(pipe)) {
166 PDB(kCondor,3) Info("ClaimVM","Activate: line = %s", line.Data());
167 }
168
169 r = gSystem->ClosePipe(pipe);
170 if (r) {
171 Error("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
172 } else {
173 PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
174 }
175
176 gSystem->Unlink(jobad);
177
178 // TODO: get info at the start for all nodes ...
179 TCondorSlave *claim = new TCondorSlave;
180 claim->fClaimID = claimId;
181 TString node(vm);
182 node = node.Remove(0, node.Index("@")+1);
183 claim->fHostname = node;
184 claim->fPort = port;
185 claim->fPerfIdx = 100; //set performance index to 100 by default
186 claim->fImage = node; //set image to hostname by default
187
188 return claim;
189}
190
191
192////////////////////////////////////////////////////////////////////////////////
193/// Get the names of the virtual machines in the pool.
194/// Return a TList of TObjString or 0 in case of failure
195
197{
198 TString poolopt = fPool ? Form("-pool %s", fPool.Data()) : "";
199 TString cmd = Form("condor_status %s -format \"%%s\\n\" Name", poolopt.Data());
200
201 PDB(kCondor,2) Info("GetVirtualMachines","command: %s", cmd.Data());
202
203 FILE *pipe = gSystem->OpenPipe(cmd, "r");
204
205 if (!pipe) {
206 SysError("GetVirtualMachines","cannot run command: %s", cmd.Data());
207 return 0;
208 }
209
211 TList *l = new TList;
212 while (line.Gets(pipe)) {
213 PDB(kCondor,3) Info("GetVirtualMachines","line = %s", line.Data());
214 if (line != "") l->Add(new TObjString(line));
215 }
216
217 Int_t r = gSystem->ClosePipe(pipe);
218 if (r) {
219 delete l;
220 Error("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
221 return 0;
222 } else {
223 PDB(kCondor,1) Info("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
224 }
225
226 return l;
227}
228
229
230////////////////////////////////////////////////////////////////////////////////
231/// Claim n virtual machines
232/// This function figures out the image and performance index before returning
233/// the list of condor slaves
234
235TList *TCondor::Claim(Int_t n, const char *cmd)
236{
237 if (fState != kFree) {
238 Error("Claim","not in state Free");
239 return 0;
240 }
241
242 TList *vms = GetVirtualMachines();
243 TIter next(vms);
244 TObjString *vm;
245 for(Int_t i=0; i < n && (vm = (TObjString*) next()) != 0; i++ ) {
246 TCondorSlave *claim = ClaimVM(vm->GetName(), cmd);
247 if (claim != 0) {
248 if ( !GetVmInfo(vm->GetName(), claim->fImage, claim->fPerfIdx) ) {
249 // assume vm is gone
250 delete claim;
251 } else {
252 fClaims->Add(claim);
253 fState = kActive;
254 }
255 }
256 }
257
258 return fClaims;
259}
260
261
262////////////////////////////////////////////////////////////////////////////////
263/// Claim virtual machine with name vmname
264/// This function does not figure out the image and performance index before
265/// returning the condor slave
266
267TCondorSlave *TCondor::Claim(const char *vmname, const char *cmd)
268{
269 if (fState != kFree && fState != kActive) {
270 Error("Claim","not in state Free or Active");
271 return 0;
272 }
273
274 TCondorSlave *claim = ClaimVM(vmname, cmd);
275 if (claim != 0) {
276 fClaims->Add(claim);
277 fState = kActive;
278 }
279
280 return claim;
281}
282
283
284////////////////////////////////////////////////////////////////////////////////
285/// Set the state of workers
286
288{
289 PDB(kCondor,1) Info("SetState","state: %s (%lld)",
290 state == kSuspended ? "kSuspended" : "kActive", Long64_t(gSystem->Now()));
291 TIter next(fClaims);
292 TCondorSlave *claim;
293 while((claim = (TCondorSlave*) next()) != 0) {
294 TString cmd = Form("condor_cod %s -id '%s'",
295 state == kSuspended ? "suspend" : "resume",
296 claim->fClaimID.Data());
297
298 PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
299 FILE *pipe = gSystem->OpenPipe(cmd, "r");
300
301 if (!pipe) {
302 SysError("SetState","cannot run command: %s", cmd.Data());
303 return kFALSE;
304 }
305
307 while (line.Gets(pipe)) {
308 PDB(kCondor,3) Info("SetState","line = %s", line.Data());
309 }
310
311 Int_t r = gSystem->ClosePipe(pipe);
312 if (r) {
313 Error("SetState","command: %s returned %d", cmd.Data(), r);
314 return kFALSE;
315 } else {
316 PDB(kCondor,1) Info("SetState","command: %s returned %d", cmd.Data(), r);
317 }
318 }
319
320 fState = state;
321 return kTRUE;
322}
323
324
325////////////////////////////////////////////////////////////////////////////////
326/// Suspend worker
327
329{
330 if (fState != kActive) {
331 Error("Suspend","not in state Active");
332 return kFALSE;
333 }
334
335 return SetState(kSuspended);
336}
337
338
339////////////////////////////////////////////////////////////////////////////////
340/// Resume worker
341
343{
344 if (fState != kSuspended) {
345 Error("Suspend","not in state Suspended");
346 return kFALSE;
347 }
348
349 return SetState(kActive);
350}
351
352
353////////////////////////////////////////////////////////////////////////////////
354/// Release worker
355
357{
358 if (fState == kFree) {
359 Error("Suspend","not in state Active or Suspended");
360 return kFALSE;
361 }
362
363 TCondorSlave *claim;
364 while((claim = (TCondorSlave*) fClaims->First()) != 0) {
365 TString cmd = Form("condor_cod release -id '%s'", claim->fClaimID.Data());
366
367 PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
368 FILE *pipe = gSystem->OpenPipe(cmd, "r");
369
370 if (!pipe) {
371 SysError("Release","cannot run command: %s", cmd.Data());
372 return kFALSE;
373 }
374
376 while (line.Gets(pipe)) {
377 PDB(kCondor,3) Info("Release","line = %s", line.Data());
378 }
379
380 Int_t r = gSystem->ClosePipe(pipe);
381 if (r) {
382 Error("Release","command: %s returned %d", cmd.Data(), r);
383 return kFALSE;
384 } else {
385 PDB(kCondor,1) Info("Release","command: %s returned %d", cmd.Data(), r);
386 }
387
388 fClaims->Remove(claim);
389 delete claim;
390 }
391
392 fState = kFree;
393 return kTRUE;
394}
395
396
397////////////////////////////////////////////////////////////////////////////////
398/// Get info about worker status
399
400Bool_t TCondor::GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
401{
402 TString cmd = Form("condor_status -format \"%%d:\" Mips -format \"%%s\\n\" FileSystemDomain "
403 "-const 'Name==\"%s\"'", vm);
404
405 PDB(kCondor,2) Info("GetVmInfo","command: %s", cmd.Data());
406 FILE *pipe = gSystem->OpenPipe(cmd, "r");
407
408 if (!pipe) {
409 SysError("GetVmInfo","cannot run command: %s", cmd.Data());
410 return kFALSE;
411 }
412
414 while (line.Gets(pipe)) {
415 PDB(kCondor,3) Info("GetVmInfo","line = %s", line.Data());
416 if (line != "") {
417 TString amips = line(TRegexp("^[0-9]*"));
418 perfidx = atoi(amips);
419 image = line(TRegexp("[^:]+$"));
420 break;
421 }
422 }
423
424 Int_t r = gSystem->ClosePipe(pipe);
425 if (r) {
426 Error("GetVmInfo","command: %s returned %d", cmd.Data(), r);
427 return kFALSE;
428 } else {
429 PDB(kCondor,1) Info("GetVmInfo","command: %s returned %d", cmd.Data(), r);
430 }
431
432 return kTRUE;
433}
434
435
436////////////////////////////////////////////////////////////////////////////////
437/// Get image of the worker
438
439TString TCondor::GetImage(const char *host) const
440{
441 TString cmd = Form("condor_status -direct %s -format \"Image:%%s\\n\" "
442 "FileSystemDomain", host);
443
444 PDB(kCondor,2) Info("GetImage","command: %s", cmd.Data());
445
446 FILE *pipe = gSystem->OpenPipe(cmd, "r");
447
448 if (!pipe) {
449 SysError("GetImage","cannot run command: %s", cmd.Data());
450 return "";
451 }
452
453 TString image;
455 while (line.Gets(pipe)) {
456 PDB(kCondor,3) Info("GetImage","line = %s", line.Data());
457 if (line != "") {
458 image = line(TRegexp("[^:]+$"));
459 break;
460 }
461 }
462
463 Int_t r = gSystem->ClosePipe(pipe);
464 if (r) {
465 Error("GetImage","command: %s returned %d", cmd.Data(), r);
466 return "";
467 } else {
468 PDB(kCondor,1) Info("GetImage","command: %s returned %d", cmd.Data(), r);
469 }
470
471 return image;
472}
473
474
475////////////////////////////////////////////////////////////////////////////////
476/// Print worker status
477
478void TCondorSlave::Print(Option_t * /*opt*/ ) const
479{
480 std::cout << "OBJ: " << IsA()->GetName()
481 << " " << fHostname << ":" << fPort
482 << " Perf: " << fPerfIdx
483 << " Image: " << fImage << std::endl;
484}
ROOT::R::TRInterface & r
Definition: Object.C:4
int Int_t
Definition: RtypesCore.h:41
const Bool_t kFALSE
Definition: RtypesCore.h:88
bool Bool_t
Definition: RtypesCore.h:59
long long Long64_t
Definition: RtypesCore.h:69
const Bool_t kTRUE
Definition: RtypesCore.h:87
const char Option_t
Definition: RtypesCore.h:62
#define ClassImp(name)
Definition: Rtypes.h:363
R__EXTERN TEnv * gEnv
Definition: TEnv.h:171
#define PDB(mask, level)
Definition: TProofDebug.h:56
char * Form(const char *fmt,...)
@ kExecutePermission
Definition: TSystem.h:46
R__EXTERN TSystem * gSystem
Definition: TSystem.h:540
virtual void Print(Option_t *option="") const
Default print for collections, calls Print(option, 1).
void Print(Option_t *option="") const
Print worker status.
Definition: TCondor.cxx:478
Int_t fPerfIdx
Definition: TCondor.h:38
TString fClaimID
Definition: TCondor.h:40
TString fImage
Definition: TCondor.h:39
TString fHostname
Definition: TCondor.h:36
Int_t fPort
Definition: TCondor.h:37
Bool_t Suspend()
Suspend worker.
Definition: TCondor.cxx:328
Bool_t Release()
Release worker.
Definition: TCondor.cxx:356
Bool_t Resume()
Resume worker.
Definition: TCondor.cxx:342
void Print(Option_t *option="") const
Print master status.
Definition: TCondor.cxx:89
TList * Claim(Int_t n, const char *cmd)
Claim n virtual machines This function figures out the image and performance index before returning t...
Definition: TCondor.cxx:235
TCondor(const char *pool="")
Create Condor interface object.
Definition: TCondor.cxx:42
EState
Definition: TCondor.h:54
@ kActive
Definition: TCondor.h:54
@ kFree
Definition: TCondor.h:54
@ kSuspended
Definition: TCondor.h:54
TCondorSlave * ClaimVM(const char *vm, const char *cmd)
Claim a VirtualMachine for PROOF usage.
Definition: TCondor.cxx:101
virtual ~TCondor()
Cleanup Condor interface.
Definition: TCondor.cxx:75
Bool_t fValid
Definition: TCondor.h:58
TString GetImage(const char *host) const
Get image of the worker.
Definition: TCondor.cxx:439
EState fState
Definition: TCondor.h:60
Bool_t GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
Get info about worker status.
Definition: TCondor.cxx:400
TList * fClaims
Definition: TCondor.h:61
TList * GetVirtualMachines() const
Get the names of the virtual machines in the pool.
Definition: TCondor.cxx:196
TString fPool
Definition: TCondor.h:59
Bool_t SetState(EState state)
Set the state of workers.
Definition: TCondor.cxx:287
virtual Int_t GetValue(const char *name, Int_t dflt) const
Returns the integer value for a resource.
Definition: TEnv.cxx:491
A doubly linked list.
Definition: TList.h:44
virtual void Add(TObject *obj)
Definition: TList.h:87
virtual TObject * Remove(TObject *obj)
Remove object from the list.
Definition: TList.cxx:818
virtual TObject * First() const
Return the first object in the list. Returns 0 when list is empty.
Definition: TList.cxx:655
Collectable string class.
Definition: TObjString.h:28
const char * GetName() const
Returns name of object.
Definition: TObjString.h:39
virtual void SysError(const char *method, const char *msgfmt,...) const
Issue system error message.
Definition: TObject.cxx:894
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition: TObject.cxx:880
virtual void Info(const char *method, const char *msgfmt,...) const
Issue info message.
Definition: TObject.cxx:854
Regular expression class.
Definition: TRegexp.h:31
Basic string class.
Definition: TString.h:131
const char * Data() const
Definition: TString.h:364
TString & ReplaceAll(const TString &s1, const TString &s2)
Definition: TString.h:687
TString & Remove(Ssiz_t pos)
Definition: TString.h:668
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
Definition: TString.h:634
virtual const char * Getenv(const char *env)
Get environment variable.
Definition: TSystem.cxx:1652
virtual TTime Now()
Get current time in milliseconds since 0:00 Jan 1 1995.
Definition: TSystem.cxx:472
virtual FILE * OpenPipe(const char *command, const char *mode)
Open a pipe.
Definition: TSystem.cxx:671
virtual FILE * TempFileName(TString &base, const char *dir=0)
Create a secure temporary file by appending a unique 6 letter string to base.
Definition: TSystem.cxx:1487
virtual int ClosePipe(FILE *pipe)
Close the pipe.
Definition: TSystem.cxx:680
virtual char * Which(const char *search, const char *file, EAccessMode mode=kFileExists)
Find location of file in a search path.
Definition: TSystem.cxx:1536
virtual Int_t GetUid(const char *user=0)
Returns the user's id. If user = 0, returns current user's id.
Definition: TSystem.cxx:1549
virtual void Setenv(const char *name, const char *value)
Set environment variable.
Definition: TSystem.cxx:1636
virtual int Unlink(const char *name)
Unlink, i.e.
Definition: TSystem.cxx:1371
virtual const char * TempDirectory() const
Return a user configured or systemwide directory to create temporary files in.
Definition: TSystem.cxx:1472
TLine * line
const Int_t n
Definition: legend1.C:16
auto * l
Definition: textangle.C:4