Logo ROOT   6.18/05
Reference Guide
TCondor.cxx
Go to the documentation of this file.
1// @(#)root/proof:$Id$
2// Author: Maarten Ballintijn 06/12/03
3
4/*************************************************************************
5 * Copyright (C) 1995-2001, Rene Brun and Fons Rademakers. *
6 * All rights reserved. *
7 * *
8 * For the licensing terms see $ROOTSYS/LICENSE. *
9 * For the list of contributors see $ROOTSYS/README/CREDITS. *
10 *************************************************************************/
11
12//////////////////////////////////////////////////////////////////////////
13// //
14// TCondor //
15// //
16// Interface to the Condor system. TCondor provides a (partial) API for //
17// querying and controlling the Condor system, including experimental //
18// extensions like COD (computing on demand) //
19// //
20//////////////////////////////////////////////////////////////////////////
21
22#include <stdlib.h>
23
24#include "TCondor.h"
25#include "TList.h"
26#include "TSystem.h"
27#include "TObjString.h"
28#include "TRegexp.h"
29#include "TProofDebug.h"
30#include "Riostream.h"
31#include "TEnv.h"
32#include "TClass.h"
33
36
37
38////////////////////////////////////////////////////////////////////////////////
39/// Create Condor interface object. Uses Condor apps since there is no
40/// API yet.
41
42TCondor::TCondor(const char *pool) : fPool(pool), fState(kFree)
43{
44 fClaims = new TList;
45
46 // Setup Condor
47
48 TString condorHome = gEnv->GetValue("Proof.CondorHome", (char*)0);
49 if (condorHome != "") {
50 TString path = gSystem->Getenv("PATH");
51 path = condorHome + "/bin:" + path;
52 gSystem->Setenv("PATH",path);
53 }
54
55 TString condorConf = gEnv->GetValue("Proof.CondorConfig", (char*)0);
56 if (condorConf != "") {
57 gSystem->Setenv("CONDOR_CONFIG",condorConf);
58 }
59
60 char *loc = gSystem->Which(gSystem->Getenv("PATH"), "condor_cod",
62
63 if (loc) {
64 fValid = kTRUE;
65 delete [] loc;
66 } else {
67 fValid = kFALSE;
68 }
69}
70
71
72////////////////////////////////////////////////////////////////////////////////
73/// Cleanup Condor interface.
74
76{
77 PDB(kCondor,1) Info("~TCondor","fState %d", fState );
78
79 if (fState != kFree) {
80 Release();
81 }
82 delete fClaims;
83}
84
85
86////////////////////////////////////////////////////////////////////////////////
87/// Print master status
88
89void TCondor::Print(Option_t * opt) const
90{
91 std::cout << "OBJ: " << IsA()->GetName()
92 << "\tPool: \"" << fPool << "\""
93 << "\tState: " << fState << std::endl;
94 fClaims->Print(opt);
95}
96
97
98////////////////////////////////////////////////////////////////////////////////
99/// Claim a VirtualMachine for PROOF usage.
100
101TCondorSlave *TCondor::ClaimVM(const char *vm, const char *cmd)
102{
103 Int_t port = 0;
104
105 TString claimCmd = Form("condor_cod request -name %s -timeout 10 2>>%s/condor.proof.%d",
106 vm, gSystem->TempDirectory(), gSystem->GetUid() );
107
108 PDB(kCondor,2) Info("ClaimVM","command: %s", claimCmd.Data());
109 FILE *pipe = gSystem->OpenPipe(claimCmd, "r");
110
111 if (!pipe) {
112 SysError("ClaimVM","cannot run command: %s", claimCmd.Data());
113 return 0;
114 }
115
116 TString claimId;
118 while (line.Gets(pipe)) {
119 PDB(kCondor,3) Info("ClaimVM","line = %s", line.Data());
120
121 if (line.BeginsWith("ClaimId = \"")) {
122 line.Remove(0, line.Index("\"")+1);
123 line.Chop(); // remove trailing "
124 claimId = line;
125 PDB(kCondor,1) Info("ClaimVM","claim = '%s'", claimId.Data());
126 TRegexp r("[0-9]*$");
127 TString num = line(r);
128 port = 37000 + atoi(num.Data());
129 PDB(kCondor,1) Info("ClaimVM","port = %d", port);
130 }
131 }
132
133 Int_t r = gSystem->ClosePipe(pipe);
134 if (r) {
135 Error("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
136 return 0;
137 } else {
138 PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", claimCmd.Data(), r);
139 }
140
141 TString jobad("jobad");
142 FILE *jf = gSystem->TempFileName(jobad);
143
144 if (jf == 0) return 0;
145
146 TString str(cmd);
147 str.ReplaceAll("$(Port)", Form("%d", port));
148 fputs(str, jf);
149
150 fclose(jf);
151
152 TString activateCmd = Form("condor_cod activate -id '%s' -jobad %s",
153 claimId.Data(), jobad.Data() );
154
155 PDB(kCondor,2) Info("ClaimVM","command: %s", activateCmd.Data());
156 pipe = gSystem->OpenPipe(activateCmd, "r");
157
158 if (!pipe) {
159 SysError("ClaimVM","cannot run command: %s", activateCmd.Data());
160 return 0;
161 }
162
163 while (line.Gets(pipe)) {
164 PDB(kCondor,3) Info("ClaimVM","Activate: line = %s", line.Data());
165 }
166
167 r = gSystem->ClosePipe(pipe);
168 if (r) {
169 Error("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
170 } else {
171 PDB(kCondor,1) Info("ClaimVM","command: %s returned %d", activateCmd.Data(), r);
172 }
173
174 gSystem->Unlink(jobad);
175
176 // TODO: get info at the start for all nodes ...
177 TCondorSlave *claim = new TCondorSlave;
178 claim->fClaimID = claimId;
179 TString node(vm);
180 node = node.Remove(0, node.Index("@")+1);
181 claim->fHostname = node;
182 claim->fPort = port;
183 claim->fPerfIdx = 100; //set performance index to 100 by default
184 claim->fImage = node; //set image to hostname by default
185
186 return claim;
187}
188
189
190////////////////////////////////////////////////////////////////////////////////
191/// Get the names of the virtual machines in the pool.
192/// Return a TList of TObjString or 0 in case of failure
193
195{
196 TString poolopt = fPool ? Form("-pool %s", fPool.Data()) : "";
197 TString cmd = Form("condor_status %s -format \"%%s\\n\" Name", poolopt.Data());
198
199 PDB(kCondor,2) Info("GetVirtualMachines","command: %s", cmd.Data());
200
201 FILE *pipe = gSystem->OpenPipe(cmd, "r");
202
203 if (!pipe) {
204 SysError("GetVirtualMachines","cannot run command: %s", cmd.Data());
205 return 0;
206 }
207
209 TList *l = new TList;
210 while (line.Gets(pipe)) {
211 PDB(kCondor,3) Info("GetVirtualMachines","line = %s", line.Data());
212 if (line != "") l->Add(new TObjString(line));
213 }
214
215 Int_t r = gSystem->ClosePipe(pipe);
216 if (r) {
217 delete l;
218 Error("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
219 return 0;
220 } else {
221 PDB(kCondor,1) Info("GetVirtualMachines","command: %s returned %d", cmd.Data(), r);
222 }
223
224 return l;
225}
226
227
228////////////////////////////////////////////////////////////////////////////////
229/// Claim n virtual machines
230/// This function figures out the image and performance index before returning
231/// the list of condor slaves
232
233TList *TCondor::Claim(Int_t n, const char *cmd)
234{
235 if (fState != kFree) {
236 Error("Claim","not in state Free");
237 return 0;
238 }
239
240 TList *vms = GetVirtualMachines();
241 TIter next(vms);
242 TObjString *vm;
243 for(Int_t i=0; i < n && (vm = (TObjString*) next()) != 0; i++ ) {
244 TCondorSlave *claim = ClaimVM(vm->GetName(), cmd);
245 if (claim != 0) {
246 if ( !GetVmInfo(vm->GetName(), claim->fImage, claim->fPerfIdx) ) {
247 // assume vm is gone
248 delete claim;
249 } else {
250 fClaims->Add(claim);
251 fState = kActive;
252 }
253 }
254 }
255
256 return fClaims;
257}
258
259
260////////////////////////////////////////////////////////////////////////////////
261/// Claim virtual machine with name vmname
262/// This function does not figure out the image and performance index before
263/// returning the condor slave
264
265TCondorSlave *TCondor::Claim(const char *vmname, const char *cmd)
266{
267 if (fState != kFree && fState != kActive) {
268 Error("Claim","not in state Free or Active");
269 return 0;
270 }
271
272 TCondorSlave *claim = ClaimVM(vmname, cmd);
273 if (claim != 0) {
274 fClaims->Add(claim);
275 fState = kActive;
276 }
277
278 return claim;
279}
280
281
282////////////////////////////////////////////////////////////////////////////////
283/// Set the state of workers
284
286{
287 PDB(kCondor,1) Info("SetState","state: %s (%lld)",
288 state == kSuspended ? "kSuspended" : "kActive", Long64_t(gSystem->Now()));
289 TIter next(fClaims);
290 TCondorSlave *claim;
291 while((claim = (TCondorSlave*) next()) != 0) {
292 TString cmd = Form("condor_cod %s -id '%s'",
293 state == kSuspended ? "suspend" : "resume",
294 claim->fClaimID.Data());
295
296 PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
297 FILE *pipe = gSystem->OpenPipe(cmd, "r");
298
299 if (!pipe) {
300 SysError("SetState","cannot run command: %s", cmd.Data());
301 return kFALSE;
302 }
303
305 while (line.Gets(pipe)) {
306 PDB(kCondor,3) Info("SetState","line = %s", line.Data());
307 }
308
309 Int_t r = gSystem->ClosePipe(pipe);
310 if (r) {
311 Error("SetState","command: %s returned %d", cmd.Data(), r);
312 return kFALSE;
313 } else {
314 PDB(kCondor,1) Info("SetState","command: %s returned %d", cmd.Data(), r);
315 }
316 }
317
318 fState = state;
319 return kTRUE;
320}
321
322
323////////////////////////////////////////////////////////////////////////////////
324/// Suspend worker
325
327{
328 if (fState != kActive) {
329 Error("Suspend","not in state Active");
330 return kFALSE;
331 }
332
333 return SetState(kSuspended);
334}
335
336
337////////////////////////////////////////////////////////////////////////////////
338/// Resume worker
339
341{
342 if (fState != kSuspended) {
343 Error("Suspend","not in state Suspended");
344 return kFALSE;
345 }
346
347 return SetState(kActive);
348}
349
350
351////////////////////////////////////////////////////////////////////////////////
352/// Release worker
353
355{
356 if (fState == kFree) {
357 Error("Suspend","not in state Active or Suspended");
358 return kFALSE;
359 }
360
361 TCondorSlave *claim;
362 while((claim = (TCondorSlave*) fClaims->First()) != 0) {
363 TString cmd = Form("condor_cod release -id '%s'", claim->fClaimID.Data());
364
365 PDB(kCondor,2) Info("SetState","command: %s", cmd.Data());
366 FILE *pipe = gSystem->OpenPipe(cmd, "r");
367
368 if (!pipe) {
369 SysError("Release","cannot run command: %s", cmd.Data());
370 return kFALSE;
371 }
372
374 while (line.Gets(pipe)) {
375 PDB(kCondor,3) Info("Release","line = %s", line.Data());
376 }
377
378 Int_t r = gSystem->ClosePipe(pipe);
379 if (r) {
380 Error("Release","command: %s returned %d", cmd.Data(), r);
381 return kFALSE;
382 } else {
383 PDB(kCondor,1) Info("Release","command: %s returned %d", cmd.Data(), r);
384 }
385
386 fClaims->Remove(claim);
387 delete claim;
388 }
389
390 fState = kFree;
391 return kTRUE;
392}
393
394
395////////////////////////////////////////////////////////////////////////////////
396/// Get info about worker status
397
398Bool_t TCondor::GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
399{
400 TString cmd = Form("condor_status -format \"%%d:\" Mips -format \"%%s\\n\" FileSystemDomain "
401 "-const 'Name==\"%s\"'", vm);
402
403 PDB(kCondor,2) Info("GetVmInfo","command: %s", cmd.Data());
404 FILE *pipe = gSystem->OpenPipe(cmd, "r");
405
406 if (!pipe) {
407 SysError("GetVmInfo","cannot run command: %s", cmd.Data());
408 return kFALSE;
409 }
410
412 while (line.Gets(pipe)) {
413 PDB(kCondor,3) Info("GetVmInfo","line = %s", line.Data());
414 if (line != "") {
415 TString amips = line(TRegexp("^[0-9]*"));
416 perfidx = atoi(amips);
417 image = line(TRegexp("[^:]+$"));
418 break;
419 }
420 }
421
422 Int_t r = gSystem->ClosePipe(pipe);
423 if (r) {
424 Error("GetVmInfo","command: %s returned %d", cmd.Data(), r);
425 return kFALSE;
426 } else {
427 PDB(kCondor,1) Info("GetVmInfo","command: %s returned %d", cmd.Data(), r);
428 }
429
430 return kTRUE;
431}
432
433
434////////////////////////////////////////////////////////////////////////////////
435/// Get image of the worker
436
437TString TCondor::GetImage(const char *host) const
438{
439 TString cmd = Form("condor_status -direct %s -format \"Image:%%s\\n\" "
440 "FileSystemDomain", host);
441
442 PDB(kCondor,2) Info("GetImage","command: %s", cmd.Data());
443
444 FILE *pipe = gSystem->OpenPipe(cmd, "r");
445
446 if (!pipe) {
447 SysError("GetImage","cannot run command: %s", cmd.Data());
448 return "";
449 }
450
451 TString image;
453 while (line.Gets(pipe)) {
454 PDB(kCondor,3) Info("GetImage","line = %s", line.Data());
455 if (line != "") {
456 image = line(TRegexp("[^:]+$"));
457 break;
458 }
459 }
460
461 Int_t r = gSystem->ClosePipe(pipe);
462 if (r) {
463 Error("GetImage","command: %s returned %d", cmd.Data(), r);
464 return "";
465 } else {
466 PDB(kCondor,1) Info("GetImage","command: %s returned %d", cmd.Data(), r);
467 }
468
469 return image;
470}
471
472
473////////////////////////////////////////////////////////////////////////////////
474/// Print worker status
475
476void TCondorSlave::Print(Option_t * /*opt*/ ) const
477{
478 std::cout << "OBJ: " << IsA()->GetName()
479 << " " << fHostname << ":" << fPort
480 << " Perf: " << fPerfIdx
481 << " Image: " << fImage << std::endl;
482}
ROOT::R::TRInterface & r
Definition: Object.C:4
int Int_t
Definition: RtypesCore.h:41
const Bool_t kFALSE
Definition: RtypesCore.h:88
bool Bool_t
Definition: RtypesCore.h:59
long long Long64_t
Definition: RtypesCore.h:69
const Bool_t kTRUE
Definition: RtypesCore.h:87
const char Option_t
Definition: RtypesCore.h:62
#define ClassImp(name)
Definition: Rtypes.h:365
R__EXTERN TEnv * gEnv
Definition: TEnv.h:171
#define PDB(mask, level)
Definition: TProofDebug.h:56
char * Form(const char *fmt,...)
@ kExecutePermission
Definition: TSystem.h:46
R__EXTERN TSystem * gSystem
Definition: TSystem.h:560
virtual void Print(Option_t *option="") const
Default print for collections, calls Print(option, 1).
void Print(Option_t *option="") const
Print worker status.
Definition: TCondor.cxx:476
Int_t fPerfIdx
Definition: TCondor.h:38
TString fClaimID
Definition: TCondor.h:40
TString fImage
Definition: TCondor.h:39
TString fHostname
Definition: TCondor.h:36
Int_t fPort
Definition: TCondor.h:37
Bool_t Suspend()
Suspend worker.
Definition: TCondor.cxx:326
Bool_t Release()
Release worker.
Definition: TCondor.cxx:354
Bool_t Resume()
Resume worker.
Definition: TCondor.cxx:340
void Print(Option_t *option="") const
Print master status.
Definition: TCondor.cxx:89
TList * Claim(Int_t n, const char *cmd)
Claim n virtual machines This function figures out the image and performance index before returning t...
Definition: TCondor.cxx:233
TCondor(const char *pool="")
Create Condor interface object.
Definition: TCondor.cxx:42
EState
Definition: TCondor.h:54
@ kActive
Definition: TCondor.h:54
@ kFree
Definition: TCondor.h:54
@ kSuspended
Definition: TCondor.h:54
TCondorSlave * ClaimVM(const char *vm, const char *cmd)
Claim a VirtualMachine for PROOF usage.
Definition: TCondor.cxx:101
virtual ~TCondor()
Cleanup Condor interface.
Definition: TCondor.cxx:75
Bool_t fValid
Definition: TCondor.h:58
TString GetImage(const char *host) const
Get image of the worker.
Definition: TCondor.cxx:437
EState fState
Definition: TCondor.h:60
Bool_t GetVmInfo(const char *vm, TString &image, Int_t &perfidx) const
Get info about worker status.
Definition: TCondor.cxx:398
TList * fClaims
Definition: TCondor.h:61
TList * GetVirtualMachines() const
Get the names of the virtual machines in the pool.
Definition: TCondor.cxx:194
TString fPool
Definition: TCondor.h:59
Bool_t SetState(EState state)
Set the state of workers.
Definition: TCondor.cxx:285
virtual Int_t GetValue(const char *name, Int_t dflt) const
Returns the integer value for a resource.
Definition: TEnv.cxx:491
A doubly linked list.
Definition: TList.h:44
virtual void Add(TObject *obj)
Definition: TList.h:87
virtual TObject * Remove(TObject *obj)
Remove object from the list.
Definition: TList.cxx:819
virtual TObject * First() const
Return the first object in the list. Returns 0 when list is empty.
Definition: TList.cxx:656
Collectable string class.
Definition: TObjString.h:28
const char * GetName() const
Returns name of object.
Definition: TObjString.h:38
virtual void SysError(const char *method, const char *msgfmt,...) const
Issue system error message.
Definition: TObject.cxx:894
virtual void Error(const char *method, const char *msgfmt,...) const
Issue error message.
Definition: TObject.cxx:880
virtual void Info(const char *method, const char *msgfmt,...) const
Issue info message.
Definition: TObject.cxx:854
Regular expression class.
Definition: TRegexp.h:31
Basic string class.
Definition: TString.h:131
const char * Data() const
Definition: TString.h:364
TString & ReplaceAll(const TString &s1, const TString &s2)
Definition: TString.h:687
TString & Remove(Ssiz_t pos)
Definition: TString.h:668
Ssiz_t Index(const char *pat, Ssiz_t i=0, ECaseCompare cmp=kExact) const
Definition: TString.h:634
virtual const char * Getenv(const char *env)
Get environment variable.
Definition: TSystem.cxx:1652
virtual TTime Now()
Get current time in milliseconds since 0:00 Jan 1 1995.
Definition: TSystem.cxx:472
virtual FILE * OpenPipe(const char *command, const char *mode)
Open a pipe.
Definition: TSystem.cxx:671
virtual FILE * TempFileName(TString &base, const char *dir=0)
Create a secure temporary file by appending a unique 6 letter string to base.
Definition: TSystem.cxx:1487
virtual int ClosePipe(FILE *pipe)
Close the pipe.
Definition: TSystem.cxx:680
virtual char * Which(const char *search, const char *file, EAccessMode mode=kFileExists)
Find location of file in a search path.
Definition: TSystem.cxx:1536
virtual Int_t GetUid(const char *user=0)
Returns the user's id. If user = 0, returns current user's id.
Definition: TSystem.cxx:1549
virtual void Setenv(const char *name, const char *value)
Set environment variable.
Definition: TSystem.cxx:1636
virtual int Unlink(const char *name)
Unlink, i.e.
Definition: TSystem.cxx:1371
virtual const char * TempDirectory() const
Return a user configured or systemwide directory to create temporary files in.
Definition: TSystem.cxx:1472
TLine * line
const Int_t n
Definition: legend1.C:16
auto * l
Definition: textangle.C:4